From 40642132f3ef0d9b946c279b371671d8b59fff14 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 19:30:49 +0100 Subject: [PATCH 01/14] [gpucpp] include Olivier's upstream "fix issue for large color matrix where a index issue was not spotted" for #781. I will regenerate and run tests. --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 49c93e01b8..d7a466dd54 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 +Subproject commit d7a466dd54bb2f57564f5cc674f129ebf095c969 From 8c654cf0d35c332e3f4449301f8a8758cc3efce5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 19:43:13 +0100 Subject: [PATCH 02/14] [gpucpp] in CODEGEN output.py, add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card". ./MG5_debug:AttributeError: 'PLUGIN_ProcessExporter' object has no attribute 'run_card_class' --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 8961036fb1..e3f88719f2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,6 +149,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! + # AV 08 Nov 2023 add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card" + run_card_class = None + # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 From e20455c5070f97bc41bf4767ce97242d06ed0b21 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 20:46:46 +0100 Subject: [PATCH 03/14] [gpucpp] regenerate all 15 processes after Olivier's latest upstream changes, which should fix #781. Apart from codegen logs, there are changes in banner.py (sketched below), as well as one change in matrix1.f for ggttggg.
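For reference, the banner.py change carried by this patch touches RunCard.write: the old code built a $-placeholder mapping for each block and called string.Template(text).substitute(mapping) directly, so any block whose "$name" placeholder was absent from the template text was silently dropped (substitute ignores unused mapping keys). The new code first appends a placeholder for each missing block. A minimal, self-contained sketch of that logic, with invented sample text and block names rather than the actual run_card template:

    import string

    # 'text' stands for the run_card template read from file; 'mapping' for the
    # per-block content that banner.py builds via b.get_template(self).
    text = "# run_card header\n$cut_block\n"
    mapping = {"cut_block": "...cut settings...", "pdf_block": "...pdf settings..."}

    # New logic: append a placeholder for any block the template lacks, so its
    # content is no longer silently dropped by the substitution below.
    for name in mapping:
        if "$%s" % name not in text:
            text += "\n$%s\n" % name
    text = string.Template(text).substitute(mapping)
    print(text)  # both cut_block and pdf_block content now appear

Without the appended placeholders, substitute() would simply leave pdf_block out of the written card: it only raises for placeholders missing from the mapping, not for mapping keys missing from the text.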
This may indeed be the fix for #781 --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 36 +-- .../ee_mumu.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 26 +-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 36 +-- .../cudacpp/gg_tt.mad/bin/internal/banner.py | 8 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 24 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 40 ++-- .../gg_tt01g.mad/bin/internal/banner.py | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 36 +-- .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 26 +-- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 34 +-- .../gg_ttgg.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 30 +-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 38 ++-- .../SubProcesses/P1_gg_ttxggg/matrix1.f | 164 +++++++------- .../gg_ttggg.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 +-- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 40 ++-- .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 36 +-- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 20 +- .../CODEGEN_mad_pp_tt012j_log.txt | 208 +++++++++--------- .../pp_tt012j.mad/bin/internal/banner.py | 8 +- 24 files changed, 452 insertions(+), 436 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 16a5e3cdc9..d5d0a77b77 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005366802215576172  +DEBUG: model prefixing takes 0.005647420883178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,17 +154,17 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. 
We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 3 routines in 0.203 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.258 s +ALOHA: aloha creates 7 routines in 0.267 s FFV1 FFV1 FFV2 @@ -226,7 +226,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -241,16 +241,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. 
quit -real 0m1.973s -user 0m1.681s -sys 0m0.231s +real 0m2.189s +user 0m1.653s +sys 0m0.232s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index d48a5c4d44..ccb39ba2cc 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533456802368164  +DEBUG: model prefixing takes 0.005671501159667969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -160,28 +160,28 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.272 s FFV1 FFV1 FFV2 @@ -198,9 +198,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.669s -user 0m0.609s -sys 0m0.053s +real 0m0.795s +user 0m0.698s +sys 0m0.066s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 2460cf072a..b0eb76c9f4 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415439605712891  +DEBUG: model prefixing takes 0.005650758743286133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.107 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.152 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.134 s +ALOHA: aloha creates 4 routines in 0.140 s VVV1 FFV1 FFV1 @@ -219,7 +219,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -230,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.735s -user 0m1.507s -sys 0m0.213s +real 0m1.780s +user 0m1.544s +sys 0m0.218s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index f9425b6b07..27709b8f4f 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057506561279296875  +DEBUG: model prefixing takes 0.005872249603271484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,22 +161,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter 
for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines @@ -193,9 +193,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.684s -user 0m0.481s -sys 0m0.057s +real 0m0.565s +user 0m0.498s +sys 0m0.040s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 2db08eff10..0eefbc9b91 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005487680435180664  +DEBUG: model prefixing takes 0.005677461624145508  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,10 +170,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,14 +211,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Wrote files for 46 helas calls in 0.249 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -226,7 +226,7 @@ ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 ALOHA: aloha creates 5 routines in 0.331 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -257,7 +257,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -276,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. 
quit -real 0m2.331s -user 0m2.084s -sys 0m0.240s +real 0m2.345s +user 0m2.078s +sys 0m0.243s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 5643c4439c..740186af78 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005489349365234375  +DEBUG: model prefixing takes 0.005747556686401367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.151 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s +Wrote files for 36 helas calls in 0.152 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.332 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.326 s VVV1 VVV1 FFV1 @@ -230,7 +230,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -245,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.275s -user 0m1.969s -sys 0m0.229s +real 0m2.221s +user 0m1.964s +sys 0m0.245s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 6c3bb7fa30..f795e1428d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00565791130065918  +DEBUG: model prefixing takes 0.0055065155029296875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.328 s VVV1 VVV1 FFV1 @@ -201,9 +201,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.836s -user 0m0.731s -sys 0m0.060s +real 0m0.870s +user 0m0.728s +sys 0m0.055s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2401636ea2..374e4defbb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415916442871094  +DEBUG: model prefixing takes 0.005505084991455078  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,22 +184,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.446 s -Wrote files for 222 helas calls in 0.728 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s +Wrote files for 222 helas calls in 0.704 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.335 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -233,7 +233,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -248,16 +248,16 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). 
-DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.354s -user 0m3.128s -sys 0m0.221s +real 0m3.310s +user 0m3.061s +sys 0m0.239s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index d29fe4c726..b1a7fdc7e4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005518913269042969  +DEBUG: model prefixing takes 0.005366086959838867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.161 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.430 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.324 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m1.541s -user 0m1.392s -sys 0m0.062s +real 0m1.466s +user 0m1.388s +sys 0m0.064s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index cd9806264d..af1d671efc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005460023880004883  +DEBUG: model prefixing takes 0.005596160888671875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.902 s +1 processes with 1240 diagrams generated in 1.921 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 
0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.662 s -Wrote files for 2281 helas calls in 18.810 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.769 s +Wrote files for 2281 helas calls in 18.847 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 
routines in 0.319 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.320 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.317 s VVV1 VVV1 FFV1 @@ -235,7 +235,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -250,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m29.634s -user 0m29.131s -sys 0m0.396s +real 0m29.796s +user 0m29.282s +sys 0m0.413s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index b8a6a894de..ac5285eda5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -17540,7 +17540,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) TMP_JAMP(2914) = TMP_JAMP(2351) + TMP_JAMP(1665) ! used 2 times TMP_JAMP(2913) = TMP_JAMP(2310) + TMP_JAMP(2134) ! used 2 times TMP_JAMP(2912) = TMP_JAMP(2073) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1483) ! used 2 times + $ ,1.000000000000000D+00)) * AMP(1481) ! used 2 times TMP_JAMP(3030) = TMP_JAMP(2935) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * TMP_JAMP(1044) ! used 2 times TMP_JAMP(3029) = TMP_JAMP(2934) - TMP_JAMP(329) ! 
used 2 times @@ -17688,7 +17688,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(360)+TMP_JAMP(485)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(558)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(576)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*AMP(1489)+(-1.000000000000000D+00) + $ ,1.000000000000000D+00))*AMP(1485)+(-1.000000000000000D+00) $ *TMP_JAMP(2911)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2971)+TMP_JAMP(2994) JAMP(2,1) = (-1.000000000000000D+00)*AMP(242)+( @@ -17698,7 +17698,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(557)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(576)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1580)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1480)+TMP_JAMP(2655)+(-1.000000000000000D+00) + $ *AMP(1476)+TMP_JAMP(2655)+(-1.000000000000000D+00) $ *TMP_JAMP(2913)+(-1.000000000000000D+00)*TMP_JAMP(2940) JAMP(3,1) = (-1.000000000000000D+00)*AMP(250)+( $ -1.000000000000000D+00)*TMP_JAMP(484)+((0.000000000000000D+00 @@ -17715,7 +17715,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(575)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1589)+TMP_JAMP(1693) $ +TMP_JAMP(2050)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1471)+(-1.000000000000000D+00)*TMP_JAMP(2353) + $ *AMP(1467)+(-1.000000000000000D+00)*TMP_JAMP(2353) $ +TMP_JAMP(2659)+TMP_JAMP(2905)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2955)+TMP_JAMP(2960) JAMP(5,1) = (-1.000000000000000D+00)*AMP(241) @@ -17919,7 +17919,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(25,1) = (-1.000000000000000D+00)*TMP_JAMP(360) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(454) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(517) - $ +(-1.000000000000000D+00)*AMP(976)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(974)+(-1.000000000000000D+00) $ *TMP_JAMP(1843)+TMP_JAMP(1859)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2085)+TMP_JAMP(2104)+( $ -1.000000000000000D+00)*TMP_JAMP(2662)+TMP_JAMP(2851) @@ -17929,7 +17929,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(518)+(-1.000000000000000D+00) $ *TMP_JAMP(834)+(-1.000000000000000D+00)*TMP_JAMP(1019) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1374) - $ +(-1.000000000000000D+00)*AMP(967)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(965)+(-1.000000000000000D+00) $ *TMP_JAMP(1479)+TMP_JAMP(1842)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2085)+(-1.000000000000000D+00) $ *TMP_JAMP(2129)+(-1.000000000000000D+00)*TMP_JAMP(2648) @@ -17940,7 +17940,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(513)+(-1.000000000000000D+00)*TMP_JAMP(809)+( $ -1.000000000000000D+00)*TMP_JAMP(1028)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1373)+(-1.000000000000000D+00) - $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) + $ *AMP(973)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) $ +(-1.000000000000000D+00)*TMP_JAMP(2104)+TMP_JAMP(2317) $ +TMP_JAMP(2387)+TMP_JAMP(2567)+(-1.000000000000000D+00) $ *TMP_JAMP(2604)+TMP_JAMP(2796)+TMP_JAMP(2811)+( @@ -17950,7 +17950,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(470)+((0.000000000000000D+00,1.000000000000000D+00)) $ 
*TMP_JAMP(514)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(735)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(958)+TMP_JAMP(1448) + $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(956)+TMP_JAMP(1448) $ +(-1.000000000000000D+00)*TMP_JAMP(1839)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1846)+(-1.000000000000000D $ +00)*TMP_JAMP(1919)+TMP_JAMP(1963)+(-1.000000000000000D+00) @@ -17960,13 +17960,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(29,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(314)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(462)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1709) + $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(964)+TMP_JAMP(1709) $ +(-1.000000000000000D+00)*TMP_JAMP(1874)+TMP_JAMP(2061) - $ +TMP_JAMP(2129)+AMP(1642)+TMP_JAMP(2445)+(-1.000000000000000D + $ +TMP_JAMP(2129)+AMP(1638)+TMP_JAMP(2445)+(-1.000000000000000D $ +00)*TMP_JAMP(2493)+TMP_JAMP(2647)+TMP_JAMP(2985)+TMP_JAMP(2996) JAMP(30,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(320)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(957)+( + $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(955)+( $ -1.000000000000000D+00)*TMP_JAMP(1840)+TMP_JAMP(1874) $ +TMP_JAMP(1919)+TMP_JAMP(1966)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2064)+TMP_JAMP(2250)+( @@ -17974,7 +17974,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3000)+TMP_JAMP(3007) JAMP(31,1) = TMP_JAMP(804)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(979)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) + $ *AMP(977)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) $ +TMP_JAMP(2609)+(-1.000000000000000D+00)*TMP_JAMP(2816) $ +TMP_JAMP(2825)+(-1.000000000000000D+00)*TMP_JAMP(2863)+( $ -1.000000000000000D+00)*TMP_JAMP(3018) @@ -17982,7 +17982,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(949)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1147)+TMP_JAMP(1280)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1374)+(-1.000000000000000D+00) - $ *AMP(970)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(968)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2067)+(-1.000000000000000D+00)*TMP_JAMP(2130) $ +TMP_JAMP(2333)+(-1.000000000000000D+00)*TMP_JAMP(2542) $ +TMP_JAMP(2713)+(-1.000000000000000D+00)*TMP_JAMP(2763) @@ -17991,7 +17991,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(33,1) = (-1.000000000000000D+00)*TMP_JAMP(1102)+( $ -1.000000000000000D+00)*TMP_JAMP(1256)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(977)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( + $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( $ -1.000000000000000D+00)*TMP_JAMP(2556)+TMP_JAMP(2811) $ +TMP_JAMP(2817)+TMP_JAMP(2882)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2976)+(-1.000000000000000D+00) @@ -18009,7 +18009,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1033)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1152)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1155)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(968)+TMP_JAMP(1582) + $ 
*TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1582) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2006) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2105) $ +TMP_JAMP(2514)+TMP_JAMP(2546)+TMP_JAMP(2695)+( @@ -18029,7 +18029,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(910)+TMP_JAMP(1277) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1346) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1373) - $ +(-1.000000000000000D+00)*AMP(980)+TMP_JAMP(1883) + $ +(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1883) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2066) $ +TMP_JAMP(2128)+TMP_JAMP(2609)+(-1.000000000000000D+00) $ *TMP_JAMP(2846)+(-1.000000000000000D+00)*TMP_JAMP(2899)+( @@ -18040,7 +18040,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(1143)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1148)+((0.000000000000000D $ +00,-1.000000000000000D+00))*TMP_JAMP(1392)+( - $ -1.000000000000000D+00)*AMP(961)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(959)+(-1.000000000000000D+00) $ *TMP_JAMP(2128)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2138)+TMP_JAMP(2296)+(-1.000000000000000D+00) $ *TMP_JAMP(2483)+(-1.000000000000000D+00)*TMP_JAMP(2535)+( @@ -18050,7 +18050,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(1020)+(-1.000000000000000D+00) $ *TMP_JAMP(1039)+TMP_JAMP(1100)+(-1.000000000000000D+00) $ *TMP_JAMP(1255)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1686) + $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(976)+TMP_JAMP(1686) $ +(-1.000000000000000D+00)*TMP_JAMP(1799)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1988)+(-1.000000000000000D $ +00)*TMP_JAMP(2497)+TMP_JAMP(2591)+(-1.000000000000000D+00) @@ -18072,7 +18072,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1159)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1211)+(-1.000000000000000D $ +00)*TMP_JAMP(1270)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(959) + $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(957) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1784) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1868) $ +(-1.000000000000000D+00)*TMP_JAMP(1939)+((0.000000000000000D @@ -18094,11 +18094,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(43,1) = TMP_JAMP(678)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(688)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(949)+TMP_JAMP(1387)+( - $ -1.000000000000000D+00)*AMP(971)+TMP_JAMP(2125)+TMP_JAMP(2127) + $ -1.000000000000000D+00)*AMP(969)+TMP_JAMP(2125)+TMP_JAMP(2127) $ +(-1.000000000000000D+00)*TMP_JAMP(2481)+TMP_JAMP(2497)+( $ -1.000000000000000D+00)*TMP_JAMP(2722)+(-1.000000000000000D+00) $ *TMP_JAMP(2897)+(-1.000000000000000D+00)*TMP_JAMP(2996) - JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(962)+( + JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960)+( $ -1.000000000000000D+00)*TMP_JAMP(2126)+(-1.000000000000000D+00) $ *TMP_JAMP(2127)+(-1.000000000000000D+00)*TMP_JAMP(2535) $ +TMP_JAMP(2556)+(-1.000000000000000D+00)*TMP_JAMP(2730)+( @@ -18107,7 +18107,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, 
IVEC) JAMP(45,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(728)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(874)+TMP_JAMP(1382)+(-1.000000000000000D+00) - $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(969)+TMP_JAMP(1824) + $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(967)+TMP_JAMP(1824) $ +(-1.000000000000000D+00)*TMP_JAMP(2088)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(2105)+(-1.000000000000000D $ +00)*TMP_JAMP(2327)+(-1.000000000000000D+00)*TMP_JAMP(2608) @@ -18127,7 +18127,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(47,1) = TMP_JAMP(1129)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1158)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1303)+(-1.000000000000000D - $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960) + $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(958) $ +TMP_JAMP(1563)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2086)+(-1.000000000000000D+00)*TMP_JAMP(2089)+( $ -1.000000000000000D+00)*TMP_JAMP(2364)+TMP_JAMP(2466)+( @@ -18146,21 +18146,21 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1900)+TMP_JAMP(1972)+TMP_JAMP(2677)+( $ -1.000000000000000D+00)*TMP_JAMP(2897)+TMP_JAMP(2954) JAMP(49,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1405) + $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1403) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1746) $ +TMP_JAMP(1892)+(-1.000000000000000D+00)*TMP_JAMP(1939) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2136) $ +TMP_JAMP(2579)+TMP_JAMP(2630)+(-1.000000000000000D+00) $ *TMP_JAMP(2836)+TMP_JAMP(2837)+TMP_JAMP(2860)+TMP_JAMP(2990) JAMP(50,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1399)+( + $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1397)+( $ -1.000000000000000D+00)*TMP_JAMP(1892)+TMP_JAMP(1938) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1977) $ +TMP_JAMP(2026)+(-1.000000000000000D+00)*TMP_JAMP(2620) $ +TMP_JAMP(2731)+TMP_JAMP(2783)+TMP_JAMP(2938)+TMP_JAMP(2986) JAMP(51,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1394)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1404) + $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1402) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1737) $ +TMP_JAMP(1891)+TMP_JAMP(1937)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2136)+TMP_JAMP(2575) @@ -18168,11 +18168,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2895) JAMP(52,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+TMP_JAMP(1385)+(-1.000000000000000D+00) - $ *AMP(1020)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( + $ *AMP(1018)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( $ -1.000000000000000D+00)*TMP_JAMP(1891)+TMP_JAMP(2145)+( $ -1.000000000000000D+00)*TMP_JAMP(2531)+(-1.000000000000000D+00) $ *TMP_JAMP(2853)+TMP_JAMP(2938)+TMP_JAMP(2988)+TMP_JAMP(3009) - JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1398) + JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1396) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1744) $ +(-1.000000000000000D+00)*TMP_JAMP(1811)+TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1977) @@ -18184,7 +18184,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
*TMP_JAMP(721)+(-1.000000000000000D+00)*TMP_JAMP(1263) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1295) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1375) - $ +(-1.000000000000000D+00)*AMP(1019)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1017)+(-1.000000000000000D+00) $ *TMP_JAMP(1655)+(-1.000000000000000D+00)*TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1986) $ +(-1.000000000000000D+00)*TMP_JAMP(2145)+TMP_JAMP(2492) @@ -18194,7 +18194,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(55,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1063)+TMP_JAMP(1141)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1177)+(-1.000000000000000D+00) - $ *AMP(1408)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( + $ *AMP(1406)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( $ -1.000000000000000D+00)*TMP_JAMP(2075)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2578)+TMP_JAMP(2821)+(-1.000000000000000D+00) @@ -18203,7 +18203,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(56,1) = TMP_JAMP(647)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1168)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1205)+(-1.000000000000000D+00) - $ *AMP(1402)+TMP_JAMP(2047)+((0.000000000000000D+00, + $ *AMP(1400)+TMP_JAMP(2047)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2452)+TMP_JAMP(2814)+(-1.000000000000000D+00) $ *TMP_JAMP(2940)+(-1.000000000000000D+00)*TMP_JAMP(2957)+( @@ -18213,7 +18213,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1172)+TMP_JAMP(1257)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1301)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1340)+(-1.000000000000000D+00) - $ *AMP(1406)+TMP_JAMP(1677)+((0.000000000000000D+00 + $ *AMP(1404)+TMP_JAMP(1677)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2142)+(-1.000000000000000D+00) $ *TMP_JAMP(2820)+TMP_JAMP(2832)+(-1.000000000000000D+00) $ *TMP_JAMP(2909)+((0.000000000000000D+00,-1.000000000000000D+00)) @@ -18233,7 +18233,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(893) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1169) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1209) - $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1400) + $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1398) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1776) $ +(-1.000000000000000D+00)*TMP_JAMP(2149)+TMP_JAMP(2729)+( $ -1.000000000000000D+00)*TMP_JAMP(2819)+(-1.000000000000000D+00) @@ -18251,7 +18251,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2879)+(-1.000000000000000D+00) $ *TMP_JAMP(2983) JAMP(61,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1409) + $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1407) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2106) $ +(-1.000000000000000D+00)*TMP_JAMP(2319)+(-1.000000000000000D $ +00)*TMP_JAMP(2805)+(-1.000000000000000D+00)*TMP_JAMP(2881) @@ -18261,14 +18261,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1231)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1288)+((0.000000000000000D $ 
+00,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D - $ +00)*AMP(1022)+((0.000000000000000D+00,1.000000000000000D+00)) + $ +00)*AMP(1020)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2106)+(-1.000000000000000D+00)*TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2271)+TMP_JAMP(2363) $ +TMP_JAMP(2437)+TMP_JAMP(2562)+(-1.000000000000000D+00) $ *TMP_JAMP(2745)+(-1.000000000000000D+00)*TMP_JAMP(2988)+( $ -1.000000000000000D+00)*TMP_JAMP(3022) JAMP(63,1) = (-1.000000000000000D+00)*TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(1407)+TMP_JAMP(1952) + $ -1.000000000000000D+00)*AMP(1405)+TMP_JAMP(1952) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2142) $ +(-1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2687)+(-1.000000000000000D+00) @@ -18278,7 +18278,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(311) $ +(-1.000000000000000D+00)*TMP_JAMP(421)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(501)+TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(947)+((0.000000000000000D+00, + $ -1.000000000000000D+00)*AMP(945)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1544)+TMP_JAMP(1683) $ +TMP_JAMP(1801)+(-1.000000000000000D+00)*TMP_JAMP(2450) $ +TMP_JAMP(2586)+TMP_JAMP(2720)+TMP_JAMP(2869) @@ -18287,7 +18287,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(65,1) = TMP_JAMP(579)+(-1.000000000000000D+00) $ *TMP_JAMP(1008)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1049)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1017) + $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1015) $ +TMP_JAMP(1611)+TMP_JAMP(1862)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1901)+TMP_JAMP(2273)+( $ -1.000000000000000D+00)*TMP_JAMP(2441)+TMP_JAMP(3022) @@ -18304,7 +18304,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2584)+TMP_JAMP(2887)+( $ -1.000000000000000D+00)*TMP_JAMP(2914)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2975) - JAMP(67,1) = (-1.000000000000000D+00)*AMP(1403)+( + JAMP(67,1) = (-1.000000000000000D+00)*AMP(1401)+( $ -1.000000000000000D+00)*TMP_JAMP(1626)+(-1.000000000000000D+00) $ *TMP_JAMP(2144)+(-1.000000000000000D+00)*TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2678)+TMP_JAMP(2768) @@ -18314,13 +18314,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1055)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1058)+TMP_JAMP(1275)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D+00) - $ *AMP(1021)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(1019)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2116)+TMP_JAMP(2144)+TMP_JAMP(2297)+( $ -1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2426)+( $ -1.000000000000000D+00)*TMP_JAMP(2486)+TMP_JAMP(2794)+( $ -1.000000000000000D+00)*TMP_JAMP(2999)+TMP_JAMP(3016) JAMP(69,1) = (-1.000000000000000D+00)*TMP_JAMP(1413)+( - $ -1.000000000000000D+00)*AMP(1401)+TMP_JAMP(2042)+TMP_JAMP(2149) + $ -1.000000000000000D+00)*AMP(1399)+TMP_JAMP(2042)+TMP_JAMP(2149) $ +TMP_JAMP(2578)+TMP_JAMP(2679)+TMP_JAMP(2731)+( $ -1.000000000000000D+00)*TMP_JAMP(2800)+(-1.000000000000000D+00) $ *TMP_JAMP(2883)+TMP_JAMP(3004) @@ -18337,7 +18337,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
-1.000000000000000D+00)*TMP_JAMP(2961) JAMP(71,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1018) + $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1016) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2091) $ +TMP_JAMP(2343)+(-1.000000000000000D+00)*TMP_JAMP(2800)+( $ -1.000000000000000D+00)*TMP_JAMP(2945)+(-1.000000000000000D+00) @@ -18359,11 +18359,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1761)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1764)+TMP_JAMP(1895)+( $ -1.000000000000000D+00)*TMP_JAMP(1932)+(-1.000000000000000D+00) - $ *AMP(1428)+TMP_JAMP(2569)+(-1.000000000000000D+00) + $ *AMP(1424)+TMP_JAMP(2569)+(-1.000000000000000D+00) $ *TMP_JAMP(2652)+TMP_JAMP(2683)+TMP_JAMP(2786)+TMP_JAMP(2796) $ +TMP_JAMP(2902) JAMP(74,1) = TMP_JAMP(2027)+TMP_JAMP(2042)+(-1.000000000000000D - $ +00)*AMP(1422)+TMP_JAMP(2383)+TMP_JAMP(2580)+( + $ +00)*AMP(1418)+TMP_JAMP(2383)+TMP_JAMP(2580)+( $ -1.000000000000000D+00)*TMP_JAMP(2683)+TMP_JAMP(2735)+( $ -1.000000000000000D+00)*TMP_JAMP(2798)+(-1.000000000000000D+00) $ *TMP_JAMP(2932)+TMP_JAMP(2942)+TMP_JAMP(3008) @@ -18372,14 +18372,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1383)+(-1.000000000000000D+00)*TMP_JAMP(1386) $ +TMP_JAMP(1860)+(-1.000000000000000D+00)*TMP_JAMP(1863)+( $ -1.000000000000000D+00)*TMP_JAMP(1895)+TMP_JAMP(1899)+( - $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2627)+TMP_JAMP(2780) + $ -1.000000000000000D+00)*AMP(1423)+TMP_JAMP(2627)+TMP_JAMP(2780) $ +(-1.000000000000000D+00)*TMP_JAMP(2895)+(-1.000000000000000D $ +00)*TMP_JAMP(2936) JAMP(76,1) = (-1.000000000000000D+00)*TMP_JAMP(1038)+( $ -1.000000000000000D+00)*TMP_JAMP(1107)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1185)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1203)+(-1.000000000000000D - $ +00)*AMP(1029)+(-1.000000000000000D+00)*TMP_JAMP(1899) + $ +00)*AMP(1027)+(-1.000000000000000D+00)*TMP_JAMP(1899) $ +TMP_JAMP(2043)+(-1.000000000000000D+00)*TMP_JAMP(2095)+( $ -1.000000000000000D+00)*TMP_JAMP(2328)+TMP_JAMP(2458)+( $ -1.000000000000000D+00)*TMP_JAMP(2611)+TMP_JAMP(2649)+( @@ -18388,13 +18388,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3009) JAMP(77,1) = (-1.000000000000000D+00)*TMP_JAMP(800) $ +TMP_JAMP(1631)+(-1.000000000000000D+00)*TMP_JAMP(1812) - $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1421)+( + $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1417)+( $ -1.000000000000000D+00)*TMP_JAMP(2332)+TMP_JAMP(2537) $ +TMP_JAMP(2932)+(-1.000000000000000D+00)*TMP_JAMP(2936)+( $ -1.000000000000000D+00)*TMP_JAMP(2972)+TMP_JAMP(3023) JAMP(78,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1216)+(-1.000000000000000D+00)*TMP_JAMP(1264)+( - $ -1.000000000000000D+00)*AMP(1028)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1026)+(-1.000000000000000D+00) $ *TMP_JAMP(1494)+(-1.000000000000000D+00)*TMP_JAMP(1633) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1764) $ +(-1.000000000000000D+00)*TMP_JAMP(1898)+TMP_JAMP(2095)+( @@ -18408,7 +18408,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1200)+TMP_JAMP(1626)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1849)+(-1.000000000000000D+00) $ *TMP_JAMP(1883)+(-1.000000000000000D+00)*TMP_JAMP(2036)+( - $ 
-1.000000000000000D+00)*AMP(1431)+TMP_JAMP(2489)+( + $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2489)+( $ -1.000000000000000D+00)*TMP_JAMP(2505)+(-1.000000000000000D+00) $ *TMP_JAMP(2570)+(-1.000000000000000D+00)*TMP_JAMP(2630) $ +TMP_JAMP(2645)+TMP_JAMP(2686)+(-1.000000000000000D+00) @@ -18417,7 +18417,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1207)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1291)+TMP_JAMP(2037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2138) - $ +(-1.000000000000000D+00)*AMP(1425)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1421)+(-1.000000000000000D+00) $ *TMP_JAMP(2250)+(-1.000000000000000D+00)*TMP_JAMP(2381)+( $ -1.000000000000000D+00)*TMP_JAMP(2686)+(-1.000000000000000D+00) $ *TMP_JAMP(2699)+TMP_JAMP(2905)+TMP_JAMP(2987)+( @@ -18428,7 +18428,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1349)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1987)+TMP_JAMP(2020)+( $ -1.000000000000000D+00)*TMP_JAMP(2141)+(-1.000000000000000D+00) - $ *AMP(1429)+(-1.000000000000000D+00)*TMP_JAMP(2773) + $ *AMP(1425)+(-1.000000000000000D+00)*TMP_JAMP(2773) $ +TMP_JAMP(2864)+(-1.000000000000000D+00)*TMP_JAMP(2909) $ +TMP_JAMP(3011) JAMP(82,1) = (-1.000000000000000D+00)*AMP(404) @@ -18448,7 +18448,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1212)+TMP_JAMP(1268)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1868)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(2011)+(-1.000000000000000D - $ +00)*AMP(1423)+TMP_JAMP(2451)+TMP_JAMP(2699)+( + $ +00)*AMP(1419)+TMP_JAMP(2451)+TMP_JAMP(2699)+( $ -1.000000000000000D+00)*TMP_JAMP(2772)+TMP_JAMP(2917)+( $ -1.000000000000000D+00)*TMP_JAMP(2939)+(-1.000000000000000D+00) $ *TMP_JAMP(2965) @@ -18465,7 +18465,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2761)+(-1.000000000000000D+00) $ *TMP_JAMP(2880)+(-1.000000000000000D+00)*TMP_JAMP(2922) $ +TMP_JAMP(2965) - JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1432)+( + JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1428)+( $ -1.000000000000000D+00)*TMP_JAMP(2372)+TMP_JAMP(2387) $ +TMP_JAMP(2393)+TMP_JAMP(2427)+(-1.000000000000000D+00) $ *TMP_JAMP(2467)+(-1.000000000000000D+00)*TMP_JAMP(2505)+( @@ -18478,14 +18478,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(983)+TMP_JAMP(1107)+TMP_JAMP(1127) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1204) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1290) - $ +(-1.000000000000000D+00)*AMP(1031)+TMP_JAMP(2146)+( + $ +(-1.000000000000000D+00)*AMP(1029)+TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2480)+TMP_JAMP(2499)+( $ -1.000000000000000D+00)*TMP_JAMP(2721)+(-1.000000000000000D+00) $ *TMP_JAMP(2896)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2977)+(-1.000000000000000D+00)*TMP_JAMP(2995) JAMP(87,1) = (-1.000000000000000D+00)*TMP_JAMP(1379)+( $ -1.000000000000000D+00)*TMP_JAMP(1953)+TMP_JAMP(2141)+( - $ -1.000000000000000D+00)*AMP(1430)+TMP_JAMP(2247)+TMP_JAMP(2403) + $ -1.000000000000000D+00)*AMP(1426)+TMP_JAMP(2247)+TMP_JAMP(2403) $ +TMP_JAMP(2882)+TMP_JAMP(2902)+(-1.000000000000000D+00) $ *TMP_JAMP(2929)+TMP_JAMP(3005) JAMP(88,1) = (-1.000000000000000D+00)*AMP(405)+( @@ -18504,7 +18504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
,1.000000000000000D+00))*TMP_JAMP(476)+TMP_JAMP(1007) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1052) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1219) - $ +(-1.000000000000000D+00)*AMP(1026)+TMP_JAMP(1696)+( + $ +(-1.000000000000000D+00)*AMP(1024)+TMP_JAMP(1696)+( $ -1.000000000000000D+00)*TMP_JAMP(1722)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1858)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1901)+(-1.000000000000000D @@ -18526,22 +18526,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(3003) JAMP(91,1) = TMP_JAMP(647)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) - $ *AMP(1426)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D + $ *AMP(1422)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D $ +00)*TMP_JAMP(2941)+(-1.000000000000000D+00)*TMP_JAMP(3023)+( $ -1.000000000000000D+00)*TMP_JAMP(3024) JAMP(92,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(985)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1204)+TMP_JAMP(1261)+TMP_JAMP(1280) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1030)+((0.000000000000000D+00 + $ +(-1.000000000000000D+00)*AMP(1028)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) $ *TMP_JAMP(2143)+TMP_JAMP(2334)+(-1.000000000000000D+00) $ *TMP_JAMP(2545)+TMP_JAMP(2714)+(-1.000000000000000D+00) $ *TMP_JAMP(2762)+TMP_JAMP(2857)+(-1.000000000000000D+00) $ *TMP_JAMP(3002) JAMP(93,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1424)+( - $ -1.000000000000000D+00)*AMP(1893)+TMP_JAMP(2465)+TMP_JAMP(2476) + $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1420)+( + $ -1.000000000000000D+00)*AMP(1889)+TMP_JAMP(2465)+TMP_JAMP(2476) $ +(-1.000000000000000D+00)*TMP_JAMP(2625)+(-1.000000000000000D $ +00)*TMP_JAMP(2917)+TMP_JAMP(2928)+(-1.000000000000000D+00) $ *TMP_JAMP(2931)+TMP_JAMP(2950)+TMP_JAMP(3024) @@ -18558,7 +18558,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +00))*TMP_JAMP(237)+(-1.000000000000000D+00)*TMP_JAMP(1043) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1250) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1027)+TMP_JAMP(2135) + $ +(-1.000000000000000D+00)*AMP(1025)+TMP_JAMP(2135) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2355)+(-1.000000000000000D $ +00)*TMP_JAMP(2381)+TMP_JAMP(2757)+TMP_JAMP(2779)+( @@ -18578,13 +18578,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1399)+(-1.000000000000000D+00)*TMP_JAMP(1953)+( $ -1.000000000000000D+00)*TMP_JAMP(2025)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2121)+(-1.000000000000000D+00) - $ *AMP(1449)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D + $ *AMP(1445)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D $ +00)*TMP_JAMP(2671)+TMP_JAMP(2689)+TMP_JAMP(2727)+TMP_JAMP(2866) $ +TMP_JAMP(3012) JAMP(98,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1401)+TMP_JAMP(1952)+(-1.000000000000000D+00) $ *TMP_JAMP(2022)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1443) + $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1439) $ +TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2408) $ +TMP_JAMP(2456)+(-1.000000000000000D+00)*TMP_JAMP(2689) $ 
+TMP_JAMP(2841)+TMP_JAMP(2908)+(-1.000000000000000D+00) @@ -18593,13 +18593,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1018)+TMP_JAMP(1376)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1378)+TMP_JAMP(1913) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2121) - $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1448)+( + $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1444)+( $ -1.000000000000000D+00)*TMP_JAMP(2490)+(-1.000000000000000D+00) $ *TMP_JAMP(2638)+TMP_JAMP(2765)+(-1.000000000000000D+00) $ *TMP_JAMP(2843)+TMP_JAMP(2901) JAMP(100,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(746)+(-1.000000000000000D+00)*TMP_JAMP(1278)+( - $ -1.000000000000000D+00)*AMP(1038)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1036)+(-1.000000000000000D+00) $ *TMP_JAMP(1913)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2012)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2119)+(-1.000000000000000D+00)*TMP_JAMP(2499) @@ -18608,13 +18608,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2952)+TMP_JAMP(3020) JAMP(101,1) = TMP_JAMP(1910)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2118)+TMP_JAMP(2124)+( - $ -1.000000000000000D+00)*AMP(1442)+AMP(1813)+TMP_JAMP(2342)+( + $ -1.000000000000000D+00)*AMP(1438)+AMP(1809)+TMP_JAMP(2342)+( $ -1.000000000000000D+00)*TMP_JAMP(2549)+(-1.000000000000000D+00) $ *TMP_JAMP(2842)+(-1.000000000000000D+00)*TMP_JAMP(2867) $ +TMP_JAMP(2984)+TMP_JAMP(3014) JAMP(102,1) = (-1.000000000000000D+00)*TMP_JAMP(1030) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1404) - $ +(-1.000000000000000D+00)*AMP(1037)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) $ *TMP_JAMP(1809)+(-1.000000000000000D+00)*TMP_JAMP(1910) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2018) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2119) @@ -18624,7 +18624,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(103,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1252)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(2125)+( - $ -1.000000000000000D+00)*AMP(1452)+TMP_JAMP(2430)+( + $ -1.000000000000000D+00)*AMP(1448)+TMP_JAMP(2430)+( $ -1.000000000000000D+00)*TMP_JAMP(2447)+(-1.000000000000000D+00) $ *TMP_JAMP(2478)+(-1.000000000000000D+00)*TMP_JAMP(2633) $ +TMP_JAMP(2664)+(-1.000000000000000D+00)*TMP_JAMP(2848) @@ -18634,7 +18634,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(845)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(962)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1228)+TMP_JAMP(2126)+( - $ -1.000000000000000D+00)*AMP(1446)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1442)+(-1.000000000000000D+00) $ *TMP_JAMP(2440)+(-1.000000000000000D+00)*TMP_JAMP(2457)+( $ -1.000000000000000D+00)*TMP_JAMP(2580)+TMP_JAMP(2739)+( $ -1.000000000000000D+00)*TMP_JAMP(2830)+(-1.000000000000000D+00) @@ -18644,7 +18644,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(989)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(1670) $ +TMP_JAMP(2088)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1450) + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1446) $ 
+TMP_JAMP(2901)+(-1.000000000000000D+00)*TMP_JAMP(2937)+( $ -1.000000000000000D+00)*TMP_JAMP(2944)+(-1.000000000000000D+00) $ *TMP_JAMP(3026) @@ -18666,7 +18666,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1304) $ +(-1.000000000000000D+00)*TMP_JAMP(1914)+TMP_JAMP(2089) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1444)+TMP_JAMP(2576) + $ +(-1.000000000000000D+00)*AMP(1440)+TMP_JAMP(2576) $ +TMP_JAMP(2828)+(-1.000000000000000D+00)*TMP_JAMP(2939)+( $ -1.000000000000000D+00)*TMP_JAMP(3026) JAMP(108,1) = (-1.000000000000000D+00)*AMP(411) @@ -18674,7 +18674,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(301) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(334) $ +(-1.000000000000000D+00)*TMP_JAMP(437)+TMP_JAMP(440) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(596)+( + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(594)+( $ -1.000000000000000D+00)*TMP_JAMP(781)+(-1.000000000000000D+00) $ *TMP_JAMP(817)+TMP_JAMP(846)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(977)+((0.000000000000000D+00, @@ -18689,7 +18689,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1378) $ +(-1.000000000000000D+00)*TMP_JAMP(1884)+(-1.000000000000000D $ +00)*TMP_JAMP(2039)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1453)+( + $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1449)+( $ -1.000000000000000D+00)*TMP_JAMP(2357)+TMP_JAMP(2523)+( $ -1.000000000000000D+00)*TMP_JAMP(2573)+TMP_JAMP(2678)+( $ -1.000000000000000D+00)*TMP_JAMP(2766)+TMP_JAMP(2775)+( @@ -18697,7 +18697,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(110,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(990)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1248)+TMP_JAMP(1277)+(-1.000000000000000D+00) - $ *AMP(1040)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *AMP(1038)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1852)+TMP_JAMP(1884)+TMP_JAMP(2040) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2116) $ +(-1.000000000000000D+00)*TMP_JAMP(2338)+(-1.000000000000000D @@ -18705,7 +18705,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3015)+(-1.000000000000000D+00)*TMP_JAMP(3020) JAMP(111,1) = TMP_JAMP(1516)+(-1.000000000000000D+00) $ *TMP_JAMP(1932)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1451)+( + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1447)+( $ -1.000000000000000D+00)*TMP_JAMP(2371)+TMP_JAMP(2519) $ +TMP_JAMP(2572)+(-1.000000000000000D+00)*TMP_JAMP(2679) $ +TMP_JAMP(2695)+TMP_JAMP(2787)+((0.000000000000000D+00 @@ -18724,7 +18724,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(78)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(321)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(739)+(-1.000000000000000D+00)*TMP_JAMP(1272)+( - $ -1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1033)+(-1.000000000000000D+00) $ *TMP_JAMP(1810)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2091)+TMP_JAMP(2803)+(-1.000000000000000D+00) $ *TMP_JAMP(2933)+TMP_JAMP(2991)+(-1.000000000000000D+00) @@ -18745,15 
+18745,15 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2915)+(-1.000000000000000D+00)*TMP_JAMP(2991) JAMP(115,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(589)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1447)+( + $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1443)+( $ -1.000000000000000D+00)*TMP_JAMP(2373)+TMP_JAMP(2550)+( $ -1.000000000000000D+00)*TMP_JAMP(2574)+(-1.000000000000000D+00) $ *TMP_JAMP(2582)+(-1.000000000000000D+00)*TMP_JAMP(2626) $ +TMP_JAMP(2629)+TMP_JAMP(2941)+(-1.000000000000000D+00) $ *TMP_JAMP(3014) - JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1039) + JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2122) - $ +TMP_JAMP(2143)+AMP(1669)+(-1.000000000000000D+00) + $ +TMP_JAMP(2143)+AMP(1665)+(-1.000000000000000D+00) $ *TMP_JAMP(2371)+(-1.000000000000000D+00)*TMP_JAMP(2619)+( $ -1.000000000000000D+00)*TMP_JAMP(2823)+TMP_JAMP(2853)+( $ -1.000000000000000D+00)*TMP_JAMP(2989)+(-1.000000000000000D+00) @@ -18761,7 +18761,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(117,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(589)+(-1.000000000000000D+00)*TMP_JAMP(1658) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1445)+AMP(1519)+TMP_JAMP(2596) + $ +(-1.000000000000000D+00)*AMP(1441)+AMP(1515)+TMP_JAMP(2596) $ +TMP_JAMP(2624)+TMP_JAMP(2633)+TMP_JAMP(2884)+TMP_JAMP(2908)+( $ -1.000000000000000D+00)*TMP_JAMP(2928)+TMP_JAMP(2959) JAMP(118,1) = ((0.000000000000000D+00,1.000000000000000D+00)) @@ -18777,7 +18777,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2858)+TMP_JAMP(2918)+(-1.000000000000000D+00) $ *TMP_JAMP(2959) JAMP(119,1) = (-1.000000000000000D+00)*TMP_JAMP(1041)+( - $ -1.000000000000000D+00)*AMP(1036)+TMP_JAMP(1608) + $ -1.000000000000000D+00)*AMP(1034)+TMP_JAMP(1608) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2614)+TMP_JAMP(2635) $ +TMP_JAMP(2933)+TMP_JAMP(2992)+TMP_JAMP(3019) @@ -18790,7 +18790,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(531)+(-1.000000000000000D+00)*TMP_JAMP(1418)+( $ -1.000000000000000D+00)*TMP_JAMP(1673)+TMP_JAMP(1724) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1797) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1462) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1458) $ +TMP_JAMP(2619)+(-1.000000000000000D+00)*TMP_JAMP(2634) $ +TMP_JAMP(2670)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2992) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % 
b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index e8d8232be5..73a2d9596c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005519866943359375  +DEBUG: model prefixing takes 0.00565791130065918  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.893 s +1 processes with 1240 diagrams generated in 1.891 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.604 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.621 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.356 s +ALOHA: aloha creates 5 routines in 0.351 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m13.085s -user 0m12.921s -sys 0m0.106s +real 0m13.161s +user 0m12.961s +sys 0m0.105s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2338d395b7..3fcb694ccd 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005399465560913086  +DEBUG: model prefixing takes 0.005532264709472656  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,10 +177,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.222 s +Wrote files for 32 helas calls in 0.224 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -260,7 +260,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -287,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. 
quit -real 0m1.957s -user 0m1.708s -sys 0m0.241s +real 0m1.962s +user 0m1.726s +sys 0m0.237s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index ad74707ae9..06d5354735 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005573272705078125  +DEBUG: model prefixing takes 0.0056154727935791016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
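
(The same banner.py hunk, repeated here for gq_ttq.mad, also extends RunCard.__new__ with an "elif issubclass(finput, RunCard)" branch, so a RunCard subclass can be passed directly as finput and is used as the concrete target class. A simplified, self-contained sketch of that dispatch pattern - the real method also inspects file/banner inputs and can fall back to launch_plugin.RunCard, and RunCardLO below is a hypothetical stand-in:

    class RunCard(object):
        def __new__(cls, finput=None, **opt):
            if cls is RunCard and isinstance(finput, type) and issubclass(finput, RunCard):
                return super(RunCard, cls).__new__(finput)  # the new 'elif' branch
            return super(RunCard, cls).__new__(cls)

    class RunCardLO(RunCard):  # hypothetical concrete run card class
        pass

    assert type(RunCard(RunCardLO)) is RunCardLO

)
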
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=1 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=1 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.146 s FFV1 FFV1 FFV1 @@ -225,9 +225,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.658s -user 0m0.590s -sys 0m0.062s +real 0m0.655s +user 0m0.595s +sys 0m0.055s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 9d96566eb2..645c0db954 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -135,22 +135,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates 1 routines in 0.062 s @@ -163,9 +163,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.429s +real 0m0.430s user 0m0.371s -sys 0m0.051s +sys 0m0.055s diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index bb2844f553..1d0d9e2a35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551915168762207  +DEBUG: model prefixing takes 0.005470752716064453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.844 s +65 processes with 1119 diagrams generated in 1.856 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.298 s -Wrote files for 810 helas calls in 3.297 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.304 s +Wrote files for 810 helas calls in 3.574 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.339 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.355 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha 
creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.318 s VVV1 VVV1 FFV1 @@ -844,7 +844,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -1021,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m8.967s -user 0m8.408s -sys 0m0.506s +real 0m9.272s +user 0m8.475s +sys 0m0.501s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self From 1fd1c4c5f493c21c3b271f980571db21c604bc7c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 20:51:09 +0100 Subject: [PATCH 04/14] [actions/gpucpp] TEMPORARILY disable testsuite on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') --- .github/workflows/testsuite_allprocesses.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 7eaad09c9f..662284f944 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -15,8 +15,9 @@ on: workflow_dispatch: # Trigger the all-processes workflow for pull requests to master - pull_request: - branches: [ master ] + # TEMPORARILY disable these tests on PRs (gh extension install 
actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') + ###pull_request: + ### branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed push: From a4f748717dd57b3632caf5947e6fb48e22f2831a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:11:16 +0100 Subject: [PATCH 05/14] [gpucpp] rerun 78 tput tests, with FPEs enabled in the check executable - usual failures in ggttg f/m and gqttq f (#783), no change in performance --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd1.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 36 +++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- 
 .../log_ggttgg_mad_f_inl0_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_common.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd1.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl1_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl1_hrd1.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_m_inl0_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_m_inl0_hrd1.txt           | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt   | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt   | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_m_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_m_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd0.txt            | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd0_bridge.txt     | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd1.txt            | 100 +++++++++---------
 .../log_gqttq_mad_f_inl0_hrd0.txt            |  92 ++++++++--------
 .../log_gqttq_mad_f_inl0_hrd0_bridge.txt     |  92 ++++++++--------
 .../log_gqttq_mad_f_inl0_hrd1.txt            |  92 ++++++++--------
 .../log_gqttq_mad_m_inl0_hrd0.txt            | 100 +++++++++---------
 .../log_gqttq_mad_m_inl0_hrd1.txt            | 100 +++++++++---------
 78 files changed, 3476 insertions(+), 3476 deletions(-)
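The 78 tput log diffs that follow change only in timestamps and measured rates. A quick way to spot-check the "no change in performance" claim in the commit message is to extract the "EvtsPerSec[MatrixElems] (3)" values from an old and a new copy of the same log and print the relative differences; the compare_tput.py helper sketched below is hypothetical and not part of the repository:

# compare_tput.py (hypothetical): python3 compare_tput.py <old_log> <new_log>
import re
import sys

# matches e.g. "EvtsPerSec[MatrixElems] (3) = ( 4.942022e+08 ) sec^-1"
PATTERN = re.compile(r'EvtsPerSec\[MatrixElems\] \(3\) = \( ([0-9.e+-]+) \)')

def throughputs(path):
    # one value per CUDA/SIMD section of a tput log
    with open(path) as f:
        return [float(m.group(1)) for m in PATTERN.finditer(f.read())]

old, new = throughputs(sys.argv[1]), throughputs(sys.argv[2])
for o, n in zip(old, new):
    print('%.6e -> %.6e (%+.1f%%)' % (o, n, 100.0 * (n - o) / o))

Differences at the level of a few percent, as in the hunks below, are consistent with ordinary run-to-run variation on the same machine.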
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index 4e0cc4f360..4f18003d70 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2023-11-03_19:00:16
+DATE: 2023-11-08_21:15:12
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.995135e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.942022e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.073010e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.482370e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.785159e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.963951e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.649523 sec
-     2,606,897,569 cycles # 2.955 GHz
-     4,039,165,920 instructions # 1.55 insn per cycle
-       0.938736477 seconds time elapsed
+TOTAL : 0.677103 sec
+     2,617,238,862 cycles # 2.883 GHz
+     4,033,048,225 instructions # 1.54 insn per cycle
+       0.968798898 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.116390e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.309346e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.309346e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115937e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.309320e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.309320e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.039128 sec
-    18,293,625,810 cycles # 3.027 GHz
-    44,037,997,118 instructions # 2.41 insn per cycle
-       6.044375342 seconds time elapsed
+TOTAL : 6.040810 sec
+    18,355,110,031 cycles # 3.037 GHz
+    44,036,146,715 instructions # 2.40 insn per cycle
+       6.046149721 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.650519e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.159299e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.159299e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.614682e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.109953e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.109953e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.212186 sec
-    12,761,177,625 cycles # 3.027 GHz
-    31,004,602,670 instructions # 2.43 insn per cycle
-       4.217391637 seconds time elapsed
+TOTAL : 4.305087 sec
+    12,797,655,048 cycles # 2.970 GHz
+    31,002,550,325 instructions # 2.42 insn per cycle
+       4.310429047 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065360e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.886676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.886676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058335e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.864325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.864325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440327 sec - 10,045,086,881 cycles # 2.916 GHz - 19,380,193,658 instructions # 1.93 insn per cycle - 3.445672409 seconds time elapsed +TOTAL : 3.453382 sec + 10,049,928,632 cycles # 2.906 GHz + 19,377,949,384 instructions # 1.93 insn per cycle + 3.458678566 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.092180e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.955480e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.955480e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.139569e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.018506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.018506e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.409304 sec - 9,718,965,428 cycles # 2.848 GHz - 18,998,332,681 instructions # 1.95 insn per cycle - 3.414677998 seconds time elapsed +TOTAL : 3.335195 sec + 9,699,652,158 cycles # 2.904 GHz + 18,994,942,569 instructions # 1.96 insn per cycle + 3.340655484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.821062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.800324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389989e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.852694 sec - 8,598,148,642 cycles # 2.229 GHz - 15,740,848,417 instructions # 1.83 insn per cycle - 3.858015954 seconds time elapsed +TOTAL : 3.895197 sec + 8,617,547,988 cycles # 2.211 GHz + 15,739,004,417 instructions # 1.83 insn per cycle + 3.900641958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index a2a2220e0b..60971ecd43 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:09 +DATE: 2023-11-08_21:50:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.616160e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.542311e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.542311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.736559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745060e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.257075 sec - 7,500,299,564 cycles # 3.000 GHz - 13,128,281,558 instructions # 1.75 insn per cycle - 2.557069801 seconds time elapsed +TOTAL : 2.222962 sec + 7,400,904,179 cycles # 2.991 GHz + 13,138,789,289 instructions # 1.78 insn per cycle + 2.532867460 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074156e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258405e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.457469 sec - 19,613,725,947 cycles # 3.035 GHz - 44,260,538,354 instructions # 2.26 insn per cycle - 6.464068851 seconds time elapsed +TOTAL : 6.440995 sec + 19,547,511,222 cycles # 3.033 GHz + 44,263,760,517 instructions # 2.26 insn per cycle + 6.447379338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.980628e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.980628e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.568240e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.019266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.019266e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.703362 sec - 14,014,545,412 cycles # 2.976 GHz - 31,843,317,256 instructions # 2.27 insn per cycle - 4.710044451 seconds time elapsed +TOTAL : 4.623039 sec + 14,052,579,459 cycles # 3.037 GHz + 31,844,500,266 instructions # 2.27 insn per cycle + 4.629479950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930954e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630364e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.630364e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.863308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.529884e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.529884e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870178 sec - 11,351,058,249 cycles # 2.929 GHz - 20,737,271,008 instructions # 1.83 insn per cycle - 3.876822605 seconds time elapsed +TOTAL : 4.004138 sec + 11,314,763,691 cycles # 2.822 GHz + 20,739,815,252 instructions # 1.83 insn per cycle + 4.010963262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.651989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.961498e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695721e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695721e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.871998 sec - 11,000,759,855 cycles # 2.837 GHz - 20,365,657,381 instructions # 1.85 insn per cycle - 3.879015734 seconds time elapsed +TOTAL : 3.824549 sec + 10,997,567,801 cycles # 2.871 GHz + 20,355,988,697 instructions # 1.85 insn per cycle + 3.831152322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.694377e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.664769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.161936e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161936e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.335020 sec - 9,935,731,633 cycles # 2.289 GHz - 16,882,918,411 instructions # 1.70 insn per cycle - 4.341683669 seconds time elapsed +TOTAL : 4.405341 sec + 9,931,414,577 cycles # 2.252 GHz + 16,884,401,146 instructions # 1.70 insn per cycle + 4.411803387 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index dedce3e2ef..75e14339dc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:12 +DATE: 2023-11-08_22:03:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.493472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.526211e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.980085e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.826607e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.612761e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.962341e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.335531 sec - 4,653,241,552 cycles # 2.971 GHz - 7,232,975,239 instructions # 1.55 insn per cycle - 1.623039981 seconds time elapsed +TOTAL : 1.301469 sec + 4,673,993,383 cycles # 3.055 GHz + 7,270,667,887 instructions # 1.56 insn per cycle + 1.586588942 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100587e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.292616e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.292616e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143440e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343019e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.487751 sec - 19,390,492,430 cycles # 2.987 GHz - 44,137,957,280 instructions # 2.28 insn per cycle - 6.493082825 seconds time elapsed +TOTAL : 6.249393 sec + 19,374,513,863 cycles # 3.098 GHz + 44,137,807,645 instructions # 2.28 insn per cycle + 6.254447436 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.649039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.157189e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.157189e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.651049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.163460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.573606 sec - 13,864,290,699 cycles # 3.029 GHz - 31,004,021,041 instructions # 2.24 insn per cycle - 4.579072706 seconds time elapsed +TOTAL : 4.566003 sec + 13,842,407,454 cycles # 3.029 GHz + 31,004,270,304 instructions # 2.24 insn per cycle + 4.571383086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.865714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.865714e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.085679e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.913536e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.913536e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.825144 sec - 11,151,950,602 cycles # 2.912 GHz - 19,279,192,444 instructions # 1.73 insn per cycle - 3.830421553 seconds time elapsed +TOTAL : 3.759234 sec + 11,164,737,043 cycles # 2.967 GHz + 19,280,466,147 instructions # 1.73 insn per cycle + 3.764531843 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.125943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.157188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.041275e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.041275e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.721741 sec - 10,820,749,101 cycles # 2.904 GHz - 18,706,645,976 instructions # 1.73 insn per cycle - 3.727088912 seconds time elapsed +TOTAL : 3.667981 sec + 10,833,619,022 cycles # 2.950 GHz + 18,695,779,485 instructions # 1.73 insn per cycle + 3.673091045 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.802766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.399092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.399092e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.852503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.471081e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.471081e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.260983 sec - 9,758,383,682 cycles # 2.288 GHz - 15,439,422,037 instructions # 1.58 insn per cycle - 4.266311634 seconds time elapsed +TOTAL : 4.150740 sec + 9,740,231,931 cycles # 2.344 GHz + 15,438,395,407 instructions # 1.59 insn per cycle + 4.156220859 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 753c8feb62..c2852b0755 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:43:56 +DATE: 2023-11-08_22:00:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492551e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.537742e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.994776e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.830407e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.634363e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.010779e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.978991 sec - 3,581,699,122 cycles # 2.964 GHz - 7,061,755,742 instructions # 1.97 insn per cycle - 1.265379690 seconds time elapsed +TOTAL : 0.985656 sec + 3,531,228,063 cycles # 2.913 GHz + 6,990,251,865 instructions # 1.98 insn per cycle + 1.270939740 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.108457e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.301315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.301315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.342569e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.342569e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.081290 sec - 18,339,334,415 cycles # 3.014 GHz - 44,033,842,254 instructions # 2.40 insn per cycle - 6.086519540 seconds time elapsed +TOTAL : 5.897072 sec + 18,280,833,177 cycles # 3.098 GHz + 44,034,372,908 instructions # 2.41 insn per cycle + 5.902241793 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.647910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.647739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.157991e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.157991e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.219825 sec - 12,790,482,904 cycles # 3.028 GHz - 31,000,190,511 instructions # 2.42 insn per cycle - 4.225042583 seconds time elapsed +TOTAL : 4.221863 sec + 12,803,042,604 cycles # 3.036 GHz + 31,005,296,735 instructions # 2.42 insn per cycle + 4.227230772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.846964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.083518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.912332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.912332e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.470466 sec - 10,075,062,185 cycles # 2.899 GHz - 19,376,808,574 instructions # 1.92 insn per cycle - 3.475725491 seconds time elapsed +TOTAL : 3.412904 sec + 10,065,358,042 cycles # 2.945 GHz + 19,377,556,628 instructions # 1.93 insn per cycle + 3.418078261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.091991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.948349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948349e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.178157e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.068476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.068476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411832 sec - 9,706,821,336 cycles # 2.841 GHz - 18,993,945,887 instructions # 1.96 insn per cycle - 3.417093831 seconds time elapsed +TOTAL : 3.275616 sec + 9,709,500,834 cycles # 2.960 GHz + 18,994,586,612 instructions # 1.96 insn per cycle + 3.280821668 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.817313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417390e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417390e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497430e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497430e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.864825 sec - 8,629,354,000 cycles # 2.231 GHz - 15,737,585,107 instructions # 1.82 insn per cycle - 3.870285071 seconds time elapsed +TOTAL : 3.750555 sec + 8,607,389,256 cycles # 2.292 GHz + 15,737,632,725 instructions # 1.83 insn per cycle + 3.755880546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 8472c31bea..6a5b6e889f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:40:37 +DATE: 2023-11-08_21:57:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.065913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.488032e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.905997e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.203248e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.569989e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906875e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.876732 sec - 6,299,612,348 cycles # 2.989 GHz - 11,571,253,190 instructions # 1.84 insn per cycle - 2.164294467 seconds time elapsed +TOTAL : 1.845079 sec + 6,274,121,781 cycles # 3.027 GHz + 11,554,949,617 instructions # 1.84 insn per cycle + 2.129841068 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.111600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304742e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.304742e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.133729e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.330465e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330465e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.064598 sec - 18,297,128,822 cycles # 3.015 GHz - 44,033,779,580 instructions # 2.41 insn per cycle - 6.069938342 seconds time elapsed +TOTAL : 5.944694 sec + 18,288,311,212 cycles # 3.074 GHz + 44,034,741,687 instructions # 2.41 insn per cycle + 5.950018785 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.622403e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120612e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.659128e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.174088e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.285433 sec - 12,790,120,071 cycles # 2.982 GHz - 31,000,688,554 instructions # 2.42 insn per cycle - 4.290779048 seconds time elapsed +TOTAL : 4.192977 sec + 12,790,691,952 cycles # 3.048 GHz + 31,002,731,251 instructions # 2.42 insn per cycle + 4.198334883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.044295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854365e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854365e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.084534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.927805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.927805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.476131 sec - 10,066,944,453 cycles # 2.893 GHz - 19,377,002,166 instructions # 1.92 insn per cycle - 3.481530813 seconds time elapsed +TOTAL : 3.410971 sec + 10,102,470,059 cycles # 2.959 GHz + 19,378,571,736 instructions # 1.92 insn per cycle + 3.416356813 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.095206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953285e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953285e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.180416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.077058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.077058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.401536 sec - 9,758,102,764 cycles # 2.865 GHz - 18,996,151,120 instructions # 1.95 insn per cycle - 3.406936941 seconds time elapsed +TOTAL : 3.272531 sec + 9,723,824,348 cycles # 2.967 GHz + 19,005,371,454 instructions # 1.95 insn per cycle + 3.277801420 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410019e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410019e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.875765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.503453e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503453e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870433 sec - 8,615,604,376 cycles # 2.224 GHz - 15,736,922,136 instructions # 1.83 insn per cycle - 3.875834680 seconds time elapsed +TOTAL : 3.745624 sec + 8,623,946,797 cycles # 2.300 GHz + 15,739,753,667 instructions # 1.83 insn per cycle + 3.750856873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index b542059ad1..3b69c80285 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:00:50 +DATE: 2023-11-08_21:15:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.000398e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960570e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110004e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.519106e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.841619e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.067099e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.647549 sec - 2,611,748,045 cycles # 2.979 GHz - 4,046,502,501 instructions # 1.55 insn per cycle - 0.933750268 seconds time elapsed +TOTAL : 0.661524 sec + 2,624,385,702 cycles # 2.945 GHz + 4,009,504,923 instructions # 1.53 insn per cycle + 0.953550123 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.159227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.372064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.372064e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.178868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.397031e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397031e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.833108 sec - 17,445,226,847 cycles # 2.989 GHz - 41,885,202,351 instructions # 2.40 insn per cycle - 5.838346819 seconds time elapsed +TOTAL : 5.737674 sec + 17,431,892,883 cycles # 3.036 GHz + 41,881,565,184 instructions # 2.40 insn per cycle + 5.743076445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.682893e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222491e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.685142e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222963e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.142121 sec - 12,470,632,862 cycles # 3.008 GHz - 30,166,171,065 instructions # 2.42 insn per cycle - 4.147564686 seconds time elapsed +TOTAL : 4.136316 sec + 12,482,235,541 cycles # 3.016 GHz + 30,165,183,766 instructions # 2.42 insn per cycle + 4.141750487 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.069225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.895121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.895121e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065221e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.894043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.894043e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.437470 sec - 9,952,077,094 cycles # 2.891 GHz - 19,112,450,451 instructions # 1.92 insn per cycle - 3.442739539 seconds time elapsed +TOTAL : 3.443708 sec + 9,960,024,892 cycles # 2.889 GHz + 19,109,707,129 instructions # 1.92 insn per cycle + 3.449179794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1930) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.130212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.139235e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.013091e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013091e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.352335 sec - 9,644,260,853 cycles # 2.874 GHz - 18,779,667,176 instructions # 1.95 insn per cycle - 3.357742942 seconds time elapsed +TOTAL : 3.337798 sec + 9,694,110,840 cycles # 2.900 GHz + 18,764,903,742 instructions # 1.94 insn per cycle + 3.343110507 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865497e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.495990e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.495990e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.864706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.496201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.496201e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.772482 sec - 8,452,356,069 cycles # 2.238 GHz - 15,617,271,494 instructions # 1.85 insn per cycle - 3.777813091 seconds time elapsed +TOTAL : 3.773287 sec + 8,448,094,450 cycles # 2.236 GHz + 15,614,366,385 instructions # 1.85 insn per cycle + 3.778658466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9fba89aff3..abd8e16103 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:25 +DATE: 2023-11-08_21:39:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.483432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.567049e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.058193e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541150e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.656561e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.025623e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676370 sec - 2,703,741,341 cycles # 2.971 GHz - 4,197,515,180 instructions # 1.55 insn per cycle - 0.967825669 seconds time elapsed +TOTAL : 0.677402 sec + 2,672,042,758 cycles # 2.933 GHz + 4,104,960,698 instructions # 1.54 insn per cycle + 0.969965661 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.672486e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.141310e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.141310e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.643045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106548e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.106548e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.163173 sec - 12,692,329,334 cycles # 3.045 GHz - 32,576,040,648 instructions # 2.57 insn per cycle - 4.168672183 seconds time elapsed +TOTAL : 4.237683 sec + 12,698,973,738 cycles # 2.997 GHz + 32,580,365,424 instructions # 2.57 insn per cycle + 4.243310096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.116856e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.025219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.025219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.102523e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.004727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.004727e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.372207 sec - 10,267,724,267 cycles # 3.041 GHz - 24,505,197,015 instructions # 2.39 insn per cycle - 3.377809241 seconds time elapsed +TOTAL : 3.394812 sec + 10,279,599,861 cycles # 3.024 GHz + 24,505,440,482 instructions # 2.38 insn per cycle + 3.400499086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.304978e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.380785e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.301834e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.372180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372180e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.125688 sec - 9,128,103,141 cycles # 2.916 GHz - 16,940,836,203 instructions # 1.86 insn per cycle - 3.131242434 seconds time elapsed +TOTAL : 3.131325 sec + 9,114,816,336 cycles # 2.906 GHz + 16,941,253,973 instructions # 1.86 insn per cycle + 3.136898880 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.298021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382509e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382509e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334227e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.444641e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.444641e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.144282 sec - 8,899,696,508 cycles # 2.834 GHz - 16,372,313,838 instructions # 1.84 insn per cycle - 3.149838418 seconds time elapsed +TOTAL : 3.093526 sec + 8,877,539,414 cycles # 2.866 GHz + 16,358,190,505 instructions # 1.84 insn per cycle + 3.099088246 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.845549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.845549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978126e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.726122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.726122e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.465226 sec - 7,910,184,141 cycles # 2.280 GHz - 14,591,740,895 instructions # 1.84 insn per cycle - 3.470686114 seconds time elapsed +TOTAL : 3.588578 sec + 7,927,907,472 cycles # 2.207 GHz + 14,594,253,089 instructions # 1.84 insn per cycle + 3.594362581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 9b85799057..d14dcc2cec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:55 +DATE: 2023-11-08_21:40:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.569964e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.063993e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.548142e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.673863e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.063444e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677772 sec - 2,691,282,086 cycles # 2.960 GHz - 4,219,338,579 instructions # 1.57 insn per cycle - 0.971577356 seconds time elapsed +TOTAL : 0.673757 sec + 2,682,929,459 cycles # 2.958 GHz + 4,116,085,529 instructions # 1.53 insn per cycle + 0.967020710 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087943e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087943e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187961e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.086286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086286e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.286151 sec - 9,910,806,255 cycles # 3.012 GHz - 25,456,031,111 instructions # 2.57 insn per cycle - 3.291763573 seconds time elapsed +TOTAL : 3.278046 sec + 9,891,835,516 cycles # 3.013 GHz + 25,457,241,379 instructions # 2.57 insn per cycle + 3.283538395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.467752e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800434e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800434e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.461475e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800212e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.950518 sec - 8,946,482,743 cycles # 3.027 GHz - 21,514,123,834 instructions # 2.40 insn per cycle - 2.956056552 seconds time elapsed +TOTAL : 2.961448 sec + 8,958,054,464 cycles # 3.020 GHz + 21,514,605,384 instructions # 2.40 insn per cycle + 2.967091806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.464134e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.723435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.723435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.449114e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.718886e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.718886e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.952533 sec - 8,633,003,733 cycles # 2.920 GHz - 15,829,431,121 instructions # 1.83 insn per cycle - 2.958100358 seconds time elapsed +TOTAL : 2.969121 sec + 8,647,101,919 cycles # 2.908 GHz + 15,830,093,651 instructions # 1.83 insn per cycle + 2.974697377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) 
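[Editorial note on the build tags 'none', 'sse4', 'avx2', '512y', '512z': each corresponds to a different width of the internal SIMD type reported as "Internal loops fptype_sv = VECTOR[n]". A sketch of how such a width-dependent type can be declared with GCC/Clang vector extensions; the fptype_sv name is taken from the logs, but the mapping below is illustrative and not the project's actual header.]

    // Sketch (GCC/Clang vector extensions): a double-precision SIMD type whose
    // width matches the "Internal loops fptype_sv = VECTOR[n]" lines above.
    #if defined(__AVX512F__)
    typedef double fptype_sv __attribute__( ( vector_size( 64 ) ) ); // VECTOR[8], 512-bit zmm ('512z')
    #elif defined(__AVX2__)
    typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // VECTOR[4], 256-bit ymm ('avx2')
    #elif defined(__SSE4_2__)
    typedef double fptype_sv __attribute__( ( vector_size( 16 ) ) ); // VECTOR[2], 128-bit xmm ('sse4')
    #else
    typedef double fptype_sv; // scalar fallback ('none': ~vector[1], no SIMD)
    #endif

[Note that this simple ISA test does not capture the '512y' variant, which the logs describe as AVX-512 instructions on 256-bit registers and which therefore also reports VECTOR[4] in double precision; the single-precision (FLOAT) logs show the same widths doubled, VECTOR[8] and VECTOR[16].]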
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.533505e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.859681e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.859681e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825562e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825562e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.876122 sec - 8,428,640,196 cycles # 2.926 GHz - 15,527,735,744 instructions # 1.84 insn per cycle - 2.881608685 seconds time elapsed +TOTAL : 2.898480 sec + 8,435,230,503 cycles # 2.906 GHz + 15,528,950,884 instructions # 1.84 insn per cycle + 2.904204103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.128966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008830e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.166244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.072345e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.072345e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.361119 sec - 7,560,312,259 cycles # 2.246 GHz - 14,293,668,051 instructions # 1.89 insn per cycle - 3.366622669 seconds time elapsed +TOTAL : 3.304157 sec + 7,572,571,500 cycles # 2.289 GHz + 14,293,792,931 instructions # 1.89 insn per cycle + 3.309751939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 46e803358f..cfc01e370f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:23 +DATE: 2023-11-08_21:16:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.626199e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.328475e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.281681e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.506984e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.290770e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.275463e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.560646 sec - 2,313,886,918 cycles # 2.957 GHz - 3,567,705,327 instructions # 1.54 insn per cycle - 0.840116151 seconds time elapsed +TOTAL : 0.565965 sec + 2,321,819,505 cycles # 2.946 GHz + 3,610,558,250 instructions # 1.56 insn per cycle + 0.846354753 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358105e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.127208e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.335415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335415e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.851033 sec - 17,813,996,987 cycles # 3.043 GHz - 43,616,814,202 instructions # 2.45 insn per cycle - 5.856069183 seconds time elapsed +TOTAL : 5.947738 sec + 17,831,603,454 cycles # 2.997 GHz + 43,615,812,813 instructions # 2.45 insn per cycle + 5.952849241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.343466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599751e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.048613 sec - 9,276,606,540 cycles # 3.040 GHz - 21,930,294,042 instructions # 2.36 insn per cycle - 3.053688884 seconds time elapsed +TOTAL : 3.049781 sec + 9,255,993,248 cycles # 3.030 GHz + 21,926,767,970 instructions # 2.37 insn per cycle + 3.055067484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.872956e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872956e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.528612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.886098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.886098e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.845518 sec - 8,308,772,789 cycles # 2.916 GHz - 15,593,301,532 instructions # 1.88 insn per cycle - 2.850623438 seconds time elapsed +TOTAL : 2.841538 sec + 8,310,122,274 cycles # 2.920 GHz + 15,590,852,784 instructions # 1.88 insn per cycle + 2.846613446 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.489948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.840461e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.840461e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.544975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.933439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.933439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.887357 sec - 8,231,785,355 cycles # 2.847 GHz - 15,437,944,905 instructions # 1.88 insn per cycle - 2.892363682 seconds time elapsed +TOTAL : 2.829740 sec + 8,228,769,997 cycles # 2.904 GHz + 15,439,791,314 instructions # 1.88 insn per cycle + 2.834839900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.580760e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.973673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973673e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.468064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.774733e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.774733e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.796324 sec - 6,629,287,981 cycles # 2.367 GHz - 12,873,018,117 instructions # 1.94 insn per cycle - 2.801456274 seconds time elapsed +TOTAL : 2.920266 sec + 6,654,443,055 cycles # 2.276 GHz + 12,870,591,658 instructions # 1.93 insn per cycle + 2.925460933 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
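[Editorial note on the EvtsPerSec figures: they are wall-clock throughputs, i.e. a fixed number of generated events divided by the elapsed time of the corresponding phase ("Rmb+ME" covers phase-space sampling plus matrix elements, "MatrixElems" the matrix elements alone). A minimal sketch of that measurement; computeMatrixElements is a placeholder for the timed kernel, not the project's actual API.]

    #include <chrono>

    // Placeholder for the kernel being timed (sketch only, not the real API).
    void computeMatrixElements( int nevt );

    double evtsPerSec( int nevt )
    {
      const auto t0 = std::chrono::steady_clock::now();
      computeMatrixElements( nevt );
      const auto t1 = std::chrono::steady_clock::now();
      // events divided by elapsed wall-clock seconds, as in the logs above
      return nevt / std::chrono::duration<double>( t1 - t0 ).count();
    }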
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index a12ca3b41d..b89c0950e0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:47 +DATE: 2023-11-08_21:51:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.243102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.475352e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.475352e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262139e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.843159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.843159e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.676327 sec - 5,681,132,328 cycles # 2.981 GHz - 10,328,752,116 instructions # 1.82 insn per cycle - 1.962251346 seconds time elapsed +TOTAL : 1.672229 sec + 5,680,712,756 cycles # 2.985 GHz + 10,249,439,391 instructions # 1.80 insn per cycle + 1.960159582 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320071e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320071e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.122888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326457e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.103747 sec - 18,503,457,384 cycles # 3.029 GHz - 43,763,268,873 instructions # 2.37 insn per cycle - 6.109986471 seconds time elapsed +TOTAL : 6.070293 sec + 18,467,877,178 cycles # 3.040 GHz + 43,763,046,084 instructions # 2.37 insn per cycle + 6.076144883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169781e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246790e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246790e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.241087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.353707e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.353707e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.406148 sec - 10,026,239,155 cycles # 2.945 GHz - 23,264,915,776 instructions # 2.32 insn per cycle - 3.412744895 seconds time elapsed +TOTAL : 3.295191 sec + 10,020,961,358 cycles # 3.037 GHz + 23,261,304,628 instructions # 2.32 insn per cycle + 3.301360149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.376931e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582524e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582524e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.364429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552712e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552712e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.133404 sec - 9,115,108,969 cycles # 2.904 GHz - 16,712,850,458 instructions # 1.83 insn per cycle - 3.139765331 seconds time elapsed +TOTAL : 3.146782 sec + 9,058,696,000 cycles # 2.874 GHz + 16,711,646,468 instructions # 1.84 insn per cycle + 3.152847146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.412136e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.649634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.649634e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.299176e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.448559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.448559e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.093398 sec - 9,015,171,302 cycles # 2.909 GHz - 16,559,247,945 instructions # 1.84 insn per cycle - 3.099791137 seconds time elapsed +TOTAL : 3.242101 sec + 8,995,544,368 cycles # 2.776 GHz + 16,559,826,795 instructions # 1.84 insn per cycle + 3.248399630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.406219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.425438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624655e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624655e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.106272 sec - 7,475,444,541 cycles # 2.404 GHz - 14,076,958,110 instructions # 1.88 insn per cycle - 3.112522018 seconds time elapsed +TOTAL : 3.082964 sec + 7,440,102,740 cycles # 2.410 GHz + 14,077,595,444 instructions # 1.89 insn per cycle + 3.089018136 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index e12a7cff38..a9a0d75eb2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:49 +DATE: 2023-11-08_22:04:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309547e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.164321e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.211559e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383746e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209904e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.237350e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.178788 sec - 4,175,363,575 cycles # 2.986 GHz - 6,687,157,832 instructions # 1.60 insn per cycle - 1.455561692 seconds time elapsed +TOTAL : 1.160269 sec + 4,203,267,927 cycles # 3.027 GHz + 6,686,907,403 instructions # 1.59 insn per cycle + 1.447760091 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.139229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.352216e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.352216e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.159461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.377089e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.377089e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.211284 sec - 18,855,190,279 cycles # 3.034 GHz - 43,795,517,542 instructions # 2.32 insn per cycle - 6.216374296 seconds time elapsed +TOTAL : 6.100201 sec + 18,832,208,439 cycles # 3.085 GHz + 43,796,080,670 instructions # 2.33 insn per cycle + 6.105246671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.318674e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546898e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546898e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.360687e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.606052e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.606052e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.402195 sec - 10,237,833,782 cycles # 3.006 GHz - 22,007,212,368 instructions # 2.15 insn per cycle - 3.407333694 seconds time elapsed +TOTAL : 3.340864 sec + 10,252,717,994 cycles # 3.065 GHz + 22,009,397,675 instructions # 2.15 insn per cycle + 3.349625818 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.476676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.816143e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.816143e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.544336e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.928692e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.928692e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.234448 sec - 9,334,268,427 cycles # 2.883 GHz - 15,503,242,414 instructions # 1.66 insn per cycle - 3.239539945 seconds time elapsed +TOTAL : 3.145870 sec + 9,340,548,482 cycles # 2.966 GHz + 15,504,284,674 instructions # 1.66 insn per cycle + 3.151101472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.532354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.931778e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.931778e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.556429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.968460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.968460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.179353 sec - 9,298,076,707 cycles # 2.921 GHz - 15,144,691,612 instructions # 1.63 insn per cycle - 3.184641880 seconds time elapsed +TOTAL : 3.140902 sec + 9,274,295,743 cycles # 2.952 GHz + 15,151,601,553 instructions # 1.63 insn per cycle + 3.145942426 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.550309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.928739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.928739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.615564e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.042980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.042980e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.163394 sec - 7,678,426,346 cycles # 2.424 GHz - 12,579,409,911 instructions # 1.64 insn per cycle - 3.168501704 seconds time elapsed +TOTAL : 3.083778 sec + 7,670,760,165 cycles # 2.484 GHz + 12,580,664,280 instructions # 1.64 insn per cycle + 3.088953388 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index ed97b2f8ed..e8e5add4c9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:44:30 +DATE: 2023-11-08_22:00:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.311918e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184761e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.263047e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217605e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.255851e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.849658 sec - 3,163,783,620 cycles # 2.955 GHz - 6,425,624,965 instructions # 2.03 insn per cycle - 1.127772989 seconds time elapsed +TOTAL : 0.834564 sec + 3,199,482,421 cycles # 3.039 GHz + 6,490,454,019 instructions # 2.03 insn per cycle + 1.111753408 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344208e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.344208e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.366095e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366095e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.993383 sec - 18,094,070,839 cycles # 3.017 GHz - 43,613,404,695 instructions # 2.41 insn per cycle - 5.998406050 seconds time elapsed +TOTAL : 5.833467 sec + 17,826,844,076 cycles # 3.054 GHz + 43,615,420,578 instructions # 2.45 insn per cycle + 5.838895279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.281067e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.486158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.486158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.337314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.571728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.571728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.130477 sec - 9,257,197,715 cycles # 2.953 GHz - 21,925,291,921 instructions # 2.37 insn per cycle - 3.135663717 seconds time elapsed +TOTAL : 3.054886 sec + 9,243,837,324 cycles # 3.022 GHz + 21,925,827,754 instructions # 2.37 insn per cycle + 3.060063052 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.526300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.881905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.881905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.568595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.965452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.965452e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.846007 sec - 8,323,404,187 cycles # 2.920 GHz - 15,589,367,643 instructions # 1.87 insn per cycle - 2.851124263 seconds time elapsed +TOTAL : 2.797209 sec + 8,337,217,151 cycles # 2.976 GHz + 15,590,584,627 instructions # 1.87 insn per cycle + 2.802297250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) 
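[Editorial note on the cycles / instructions / "insn per cycle" triplets: these are hardware performance counters in perf-stat style output, and the third number is simply the ratio of the first two. A tiny check of that arithmetic, with the counter values copied from the avx2 stanza just above.]

    #include <cstdio>

    int main()
    {
      // Counter values copied from the avx2 stanza above (log values, not recomputed)
      const double instructions = 15590584627.;
      const double cycles = 8337217151.;
      std::printf( "insn per cycle = %.2f\n", instructions / cycles ); // prints 1.87, matching the log
      return 0;
    }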
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951403e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.951403e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.613887e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.042160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.042160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.815665 sec - 8,248,875,592 cycles # 2.925 GHz - 15,439,478,624 instructions # 1.87 insn per cycle - 2.820889860 seconds time elapsed +TOTAL : 2.753850 sec + 8,236,246,865 cycles # 2.988 GHz + 15,440,580,051 instructions # 1.87 insn per cycle + 2.758988038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.553964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.948928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.649804e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085948e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085948e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.827281 sec - 6,687,814,053 cycles # 2.363 GHz - 12,869,763,437 instructions # 1.92 insn per cycle - 2.832592565 seconds time elapsed +TOTAL : 2.730536 sec + 6,628,841,045 cycles # 2.424 GHz + 12,869,136,387 instructions # 1.94 insn per cycle + 2.735524791 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index c7d745ef4d..4353a0323c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:41:12 +DATE: 2023-11-08_21:57:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.077097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138341e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120075e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.439872e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.182276e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.152259e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.480161 sec - 5,077,584,264 cycles # 2.967 GHz - 9,258,149,444 instructions # 1.82 insn per cycle - 1.768271684 seconds time elapsed +TOTAL : 1.433462 sec + 5,039,023,930 cycles # 3.052 GHz + 9,234,566,396 instructions # 1.83 insn per cycle + 1.710073871 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.142005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.165155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.381805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.381805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.874856 sec - 17,835,700,462 cycles # 3.034 GHz - 43,613,540,806 instructions # 2.45 insn per cycle - 5.879931479 seconds time elapsed +TOTAL : 5.755055 sec + 17,830,794,091 cycles # 3.096 GHz + 43,613,836,777 instructions # 2.45 insn per cycle + 5.760227416 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.282759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.340707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569922e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569922e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.129132 sec - 9,269,728,355 cycles # 2.963 GHz - 21,928,484,188 instructions # 2.37 insn per cycle - 3.134244707 seconds time elapsed +TOTAL : 3.052308 sec + 9,235,069,524 cycles # 3.022 GHz + 21,925,950,370 instructions # 2.37 insn per cycle + 3.057391403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516560e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.868004e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.868004e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.942662e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942662e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.857533 sec - 8,336,241,805 cycles # 2.913 GHz - 15,589,958,795 instructions # 1.87 insn per cycle - 2.862709487 seconds time elapsed +TOTAL : 2.806664 sec + 8,327,245,678 cycles # 2.963 GHz + 15,591,035,358 instructions # 1.87 insn per cycle + 2.811768123 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.924197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.924197e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.574877e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971987e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.838427 sec - 8,267,692,084 cycles # 2.908 GHz - 15,438,877,256 instructions # 1.87 insn per cycle - 2.843475918 seconds time elapsed +TOTAL : 2.795905 sec + 8,237,659,186 cycles # 2.942 GHz + 15,439,551,856 instructions # 1.87 insn per cycle + 2.800978610 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.539393e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.905150e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.905150e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.627739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061419e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.843291 sec - 6,667,785,493 cycles # 2.342 GHz - 12,868,798,226 instructions # 1.93 insn per cycle - 2.848396098 seconds time elapsed +TOTAL : 2.749607 sec + 6,653,390,801 cycles # 2.416 GHz + 12,870,556,050 instructions # 1.93 insn per cycle + 2.754896991 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
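[Editorial note on the WARNING lines "Bridge selected: cannot use RamboDevice, will use RamboHost" and "RamboHost selected: cannot use CurandDevice, will use CurandHost" above: they describe a cascade of fallbacks in which forcing one stage of the pipeline onto the host also demotes the upstream stages that would otherwise feed it from the device. A hedged sketch of that selection logic; the enum and function names are illustrative only and do not reflect the project's actual classes.]

    #include <cstdio>

    enum class Side { Device, Host };

    // Illustrative only: choose where sampling (Rambo) and random numbers (Curand)
    // run, demoting upstream stages when a downstream stage is host-side.
    void chooseWorkflow( bool useBridge, Side& rambo, Side& curand )
    {
      rambo = Side::Device;
      curand = Side::Device;
      if ( useBridge ) // the Bridge consumes host-side momenta
      {
        rambo = Side::Host;
        std::puts( "WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost" );
      }
      if ( rambo == Side::Host ) // host-side Rambo needs host-side random numbers
      {
        curand = Side::Host;
        std::puts( "WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost" );
      }
    }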
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 2a5177092e..4a8bf7a45a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:53 +DATE: 2023-11-08_21:16:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.628396e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344836e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.322116e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.504004e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.299164e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301394e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.558495 sec - 2,344,289,295 cycles # 2.966 GHz - 3,579,154,611 instructions # 1.53 insn per cycle - 0.847997464 seconds time elapsed +TOTAL : 0.565713 sec + 2,319,019,949 cycles # 2.949 GHz + 3,628,185,594 instructions # 1.56 insn per cycle + 0.846311751 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.195436e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.435503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.435503e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.206183e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.634613 sec - 16,757,667,455 cycles # 2.972 GHz - 41,375,848,460 instructions # 2.47 insn per cycle - 5.639688103 seconds time elapsed +TOTAL : 5.582273 sec + 16,756,629,307 cycles # 2.999 GHz + 41,373,009,702 instructions # 2.47 insn per cycle + 5.587382956 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.409189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.401015e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.974456 sec - 9,031,167,153 cycles # 3.032 GHz - 21,234,204,961 instructions # 2.35 insn per cycle - 2.979655809 seconds time elapsed +TOTAL : 2.986422 sec + 9,012,092,925 cycles # 3.013 GHz + 21,229,937,185 instructions # 2.36 insn per cycle + 2.991621252 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1841) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541260e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.541320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.913153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.913153e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.832126 sec - 8,284,857,543 cycles # 2.922 GHz - 15,430,300,133 instructions # 1.86 insn per cycle - 2.837298063 seconds time elapsed +TOTAL : 2.831515 sec + 8,274,365,196 cycles # 2.917 GHz + 15,424,948,763 instructions # 1.86 insn per cycle + 2.836960602 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2536) (512y: 0) (512z: 0) 
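[Editorial note on the "=Symbols in CPPProcess.o=" summaries: they count SIMD instructions of each family in the compiled object, which is how one can verify that e.g. an 'avx2' build really contains ymm code and no zmm code. A sketch of one way to produce such counts by piping objdump output (assumes GNU binutils and POSIX popen; the project's actual classification script may use finer criteria, in particular this register-based count cannot separate '512y', AVX-512 on 256-bit registers, from plain AVX2).]

    #include <cstdio> // also provides POSIX popen/pclose on Linux
    #include <string>

    // Count disassembled instructions whose operands use ymm (256-bit)
    // or zmm (512-bit) registers in a given object file.
    void countSimdSymbols( const std::string& objfile )
    {
      const std::string cmd = "objdump -d " + objfile;
      FILE* pipe = popen( cmd.c_str(), "r" );
      if ( !pipe ) return;
      int nymm = 0, nzmm = 0;
      char line[512];
      while ( fgets( line, sizeof( line ), pipe ) )
      {
        const std::string s( line );
        if ( s.find( "%zmm" ) != std::string::npos ) nzmm++;
        else if ( s.find( "%ymm" ) != std::string::npos ) nymm++;
      }
      pclose( pipe );
      std::printf( "(avx2: %d) (512z: %d)\n", nymm, nzmm );
    }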
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.031163e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.031163e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.051139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.051139e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.778473 sec - 8,124,076,124 cycles # 2.921 GHz - 15,242,043,085 instructions # 1.88 insn per cycle - 2.783650122 seconds time elapsed +TOTAL : 2.773779 sec + 8,126,258,677 cycles # 2.925 GHz + 15,238,451,861 instructions # 1.88 insn per cycle + 2.778950300 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.583024e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982786e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.982786e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.571238e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958685e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.793855 sec - 6,612,725,918 cycles # 2.363 GHz - 12,851,623,569 instructions # 1.94 insn per cycle - 2.799020549 seconds time elapsed +TOTAL : 2.804796 sec + 6,629,701,677 cycles # 2.360 GHz + 12,848,530,488 instructions # 1.94 insn per cycle + 2.809910943 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1705) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index b5507320b6..b8155a680e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:23 +DATE: 2023-11-08_21:40:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.295762e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181123e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.251991e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.302615e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188065e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274309e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577748 sec - 2,371,472,909 cycles # 2.938 GHz - 3,662,215,838 instructions # 1.54 insn per cycle - 0.866645313 seconds time elapsed +TOTAL : 0.574549 sec + 2,352,849,250 cycles # 2.917 GHz + 3,649,350,219 instructions # 1.55 insn per cycle + 0.863978578 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.709669e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230063e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230063e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.686060e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.194010e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.194010e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.043238 sec - 12,201,253,013 cycles # 3.016 GHz - 32,520,928,331 instructions # 2.67 insn per cycle - 4.048480591 seconds time elapsed +TOTAL : 4.097911 sec + 12,184,788,464 cycles # 2.970 GHz + 32,521,623,255 instructions # 2.67 insn per cycle + 4.103328943 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.776736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.688717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.688717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.770837e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.689962e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.629894 sec - 8,006,523,859 cycles # 3.039 GHz - 18,689,561,969 instructions # 2.33 insn per cycle - 2.635155805 seconds time elapsed +TOTAL : 2.634890 sec + 7,998,179,733 cycles # 3.030 GHz + 18,690,180,922 instructions # 2.34 insn per cycle + 2.640235037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.876319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.861879e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.750654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.750654e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.544972 sec - 7,483,863,921 cycles # 2.935 GHz - 14,252,784,118 instructions # 1.90 insn per cycle - 2.550249205 seconds time elapsed +TOTAL : 2.559375 sec + 7,467,736,067 cycles # 2.913 GHz + 14,255,217,150 instructions # 1.91 insn per cycle + 2.564904201 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.940665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960644e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.960644e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.908800e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.910304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.910304e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.495422 sec - 7,326,781,172 cycles # 2.931 GHz - 13,945,833,508 instructions # 1.90 insn per cycle - 2.500698244 seconds time elapsed +TOTAL : 2.522982 sec + 7,364,286,769 cycles # 2.913 GHz + 13,952,625,236 instructions # 1.89 insn per cycle + 2.528348787 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.636740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.108198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.108198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.584257e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.006941e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.006941e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.746264 sec - 6,527,138,912 cycles # 2.373 GHz - 13,421,028,013 instructions # 2.06 insn per cycle - 2.751679406 seconds time elapsed +TOTAL : 2.801165 sec + 6,529,127,011 cycles # 2.327 GHz + 13,421,836,325 instructions # 2.06 insn per cycle + 2.806446897 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) 
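As a sanity check of the SIMD scaling in this f_inl1_hrd0 log, the MatrixElems rates from the '+' lines above, normalized to the no-SIMD build, come out at roughly x1.8-x2.2 (a few lines of Python, values copied by hand from the hunks above):

# Values copied from the '+' EvtsPerSec[MatrixElems] lines of this log.
throughput = {
    'none': 2.194010e+06,
    'sse4': 4.689962e+06,
    'avx2': 4.750654e+06,
    '512y': 4.910304e+06,
    '512z': 4.006941e+06,
}
base = throughput['none']
for mode, value in throughput.items():
    print(f"{mode}: x{value / base:.2f}")  # 512y is the fastest here, ~x2.24

For this process the measured ratios stay well below the nominal float vector widths (up to 16 for 512z), as the excerpts themselves show.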
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index b6c42e0895..385ce72d78 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:50 +DATE: 2023-11-08_21:41:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300995e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194789e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.295764e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.304320e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197410e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.300141e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.573687 sec - 2,396,122,888 cycles # 2.957 GHz - 3,709,386,643 instructions # 1.55 insn per cycle - 0.867525381 seconds time elapsed +TOTAL : 0.574067 sec + 2,385,415,994 cycles # 2.943 GHz + 3,655,710,101 instructions # 1.53 insn per cycle + 0.868231647 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.274435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.306451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.306451e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254695e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267118e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267118e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.128769 sec - 9,423,056,878 cycles # 3.008 GHz - 25,306,341,141 instructions # 2.69 insn per cycle - 3.134038482 seconds time elapsed +TOTAL : 3.154781 sec + 9,423,263,848 cycles # 2.983 GHz + 25,307,020,372 instructions # 2.69 insn per cycle + 3.160042496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759584e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759584e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.134634e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.819272e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.819272e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.397339 sec - 7,201,211,606 cycles # 2.998 GHz - 16,901,413,977 instructions # 2.35 insn per cycle - 2.402789017 seconds time elapsed +TOTAL : 2.372030 sec + 7,183,608,233 cycles # 3.022 GHz + 16,901,599,192 instructions # 2.35 insn per cycle + 2.377377295 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1359) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.019910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199492e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.035295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.215553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215553e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.443323 sec - 7,147,435,963 cycles # 2.920 GHz - 13,619,110,670 instructions # 1.91 insn per cycle - 2.448969091 seconds time elapsed +TOTAL : 2.433519 sec + 7,141,153,744 cycles # 2.929 GHz + 13,619,130,373 instructions # 1.91 insn per cycle + 2.438958453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) 
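For reference when scanning these filenames: the tput logs encode the build variant directly in the name. A hypothetical decoder (the helper and regex are mine; the convention is read off the paths above, with 'sa' assumed by analogy with the .sa output directories) could look like:

import re

# log_<process>_<mad|sa>_<d|f|m>_inl<0|1>_hrd<0|1>[_<tag>].txt, as seen in
# the paths above (e.g. log_eemumu_mad_f_inl1_hrd1.txt).
NAME = re.compile(r'log_(\w+?)_(mad|sa)_([dfm])_inl([01])_hrd([01])(?:_(\w+))?\.txt')

def decode(filename):
    m = NAME.fullmatch(filename)
    if m is None:
        raise ValueError(f'unexpected log name: {filename}')
    proc, backend, prec, inl, hrd, tag = m.groups()
    return dict(process=proc, backend=backend,
                precision={'d': 'DOUBLE', 'f': 'FLOAT', 'm': 'MIXED'}[prec],
                inlineHel=int(inl), hardcodePARAM=int(hrd), variant=tag)

print(decode('log_ggtt_mad_d_inl0_hrd0_bridge.txt'))
# {'process': 'ggtt', 'backend': 'mad', 'precision': 'DOUBLE',
#  'inlineHel': 0, 'hardcodePARAM': 0, 'variant': 'bridge'}

The d/f/m letter matches the 'FP precision = DOUBLE/FLOAT/MIXED' line inside each log, and inl/hrd match the [inlineHel=...] [hardcodePARAM=...] flags in the Process line.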
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.050148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.307582e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.307582e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.071324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.326333e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.326333e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.423418 sec - 7,082,396,314 cycles # 2.918 GHz - 13,431,226,521 instructions # 1.90 insn per cycle - 2.429141482 seconds time elapsed +TOTAL : 2.408738 sec + 7,063,825,257 cycles # 2.927 GHz + 13,435,596,499 instructions # 1.90 insn per cycle + 2.414135887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.725279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338904e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.750195e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.390595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.390595e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.669392 sec - 6,366,623,257 cycles # 2.381 GHz - 13,153,230,984 instructions # 2.07 insn per cycle - 2.674848562 seconds time elapsed +TOTAL : 2.646969 sec + 6,340,373,316 cycles # 2.391 GHz + 13,154,077,274 instructions # 2.07 insn per cycle + 2.652485679 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 40be1e0fe4..a176ffc4e4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:23 +DATE: 2023-11-08_21:17:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.986561e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.920506e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.026737e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486918e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802792e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976330e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.651585 sec - 2,613,210,290 cycles # 2.977 GHz - 4,026,633,947 instructions # 1.54 insn per cycle - 0.940304085 seconds time elapsed +TOTAL : 0.656484 sec + 2,625,682,009 cycles # 2.960 GHz + 4,099,364,380 instructions # 1.56 insn per cycle + 0.946865269 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.098312e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283308e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283308e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.274850e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274850e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.126587 sec - 18,732,621,094 cycles # 3.056 GHz - 44,288,636,649 instructions # 2.36 insn per cycle - 6.131702524 seconds time elapsed +TOTAL : 6.167326 sec + 18,738,979,619 cycles # 3.037 GHz + 44,287,346,211 instructions # 2.36 insn per cycle + 6.172563885 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.724748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.279623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.279623e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.716365e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.273883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273883e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.052368 sec - 12,345,078,225 cycles # 3.044 GHz - 30,962,385,061 instructions # 2.51 insn per cycle - 4.057665704 seconds time elapsed +TOTAL : 4.065766 sec + 12,369,623,289 cycles # 3.039 GHz + 30,960,892,415 instructions # 2.50 insn per cycle + 4.071137873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.012805e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801799e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801799e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040246e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.832671e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.832671e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.527503 sec - 10,105,777,222 cycles # 2.861 GHz - 19,402,091,411 instructions # 1.92 insn per cycle - 3.532885933 seconds time elapsed +TOTAL : 3.479287 sec + 10,114,657,367 cycles # 2.903 GHz + 19,400,067,612 instructions # 1.92 insn per cycle + 3.484811762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.136223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.011490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.011490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.136561e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.021650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.021650e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.337554 sec - 9,780,270,182 cycles # 2.927 GHz - 18,984,447,401 instructions # 1.94 insn per cycle - 3.342834380 seconds time elapsed +TOTAL : 3.335937 sec + 9,745,210,637 cycles # 2.917 GHz + 18,969,865,366 instructions # 1.95 insn per cycle + 3.341324685 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1859) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476604e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476604e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.678279 sec - 8,374,553,290 cycles # 2.274 GHz - 15,066,979,076 instructions # 1.80 insn per cycle - 3.683518796 seconds time elapsed +TOTAL : 3.810646 sec + 8,364,453,052 cycles # 2.192 GHz + 15,065,277,596 instructions # 1.80 insn per cycle + 3.816111336 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 155) (512z: 1316) 
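The derived figures in the perf-style counter lines can be cross-checked directly: the GHz annotation is, to the precision printed, cycles divided by the measured time, and 'insn per cycle' is instructions over cycles. Taking the 512z block just above as an example (my own spot check; perf actually normalizes against task-clock, which for these single-threaded runs coincides with elapsed time to within ~0.2%):

# Spot check of the 512z m_inl0_hrd0 counters quoted above.
cycles = 8_364_453_052
instructions = 15_065_277_596
elapsed_s = 3.816111336

print(f'{cycles / elapsed_s / 1e9:.3f} GHz')          # 2.192, as in the log
print(f'{instructions / cycles:.2f} insn per cycle')  # 1.80, as in the log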
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index d0448f95d2..257a2b14eb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:57 +DATE: 2023-11-08_21:17:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.995389e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.942657e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069355e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.517340e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.835074e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.047913e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.648218 sec - 2,577,449,374 cycles # 2.937 GHz - 3,930,119,139 instructions # 1.52 insn per cycle - 0.934838617 seconds time elapsed +TOTAL : 0.657459 sec + 2,634,612,924 cycles # 2.971 GHz + 4,038,430,114 instructions # 1.53 insn per cycle + 0.947276631 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.138539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.340756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.340756e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.135032e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.337803e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337803e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.930169 sec - 17,940,598,550 cycles # 3.023 GHz - 42,539,439,563 instructions # 2.37 insn per cycle - 5.935391018 seconds time elapsed +TOTAL : 5.948882 sec + 17,974,083,702 cycles # 3.020 GHz + 42,538,758,836 instructions # 2.37 insn per cycle + 5.954247483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.737380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320541e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.746148e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.320939e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320939e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.022351 sec - 12,179,829,023 cycles # 3.025 GHz - 30,269,422,152 instructions # 2.49 insn per cycle - 4.027705928 seconds time elapsed +TOTAL : 4.005000 sec + 12,179,888,264 cycles # 3.038 GHz + 30,267,022,025 instructions # 2.48 insn per cycle + 4.010444441 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.791277e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.791277e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877404e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877404e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.544763 sec - 10,086,483,930 cycles # 2.843 GHz - 19,285,075,836 instructions # 1.91 insn per cycle - 3.550049339 seconds time elapsed +TOTAL : 3.440250 sec + 10,026,177,275 cycles # 2.911 GHz + 19,281,771,933 instructions # 1.92 insn per cycle + 3.445652030 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2162) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.153713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.048947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.048947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.165158e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.064737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.064737e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.313722 sec - 9,652,564,948 cycles # 2.909 GHz - 18,773,850,855 instructions # 1.94 insn per cycle - 3.319022077 seconds time elapsed +TOTAL : 3.297369 sec + 9,639,905,003 cycles # 2.920 GHz + 18,781,958,033 instructions # 1.95 insn per cycle + 3.302769757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1833) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576380e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576380e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.925761e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.602996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.602996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.691490 sec - 8,274,258,282 cycles # 2.239 GHz - 14,991,882,108 instructions # 1.81 insn per cycle - 3.696773496 seconds time elapsed +TOTAL : 3.664817 sec + 8,281,446,223 cycles # 2.257 GHz + 14,988,620,827 instructions # 1.81 insn per cycle + 3.670422107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1020) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index ecfe1f9032..06ab23436d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:30 +DATE: 2023-11-08_21:18:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.269149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178306e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270483e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.051243e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169781e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269231e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515028 sec - 2,190,362,135 cycles # 2.945 GHz - 3,134,430,746 instructions # 1.43 insn per cycle - 0.801320986 seconds time elapsed +TOTAL : 0.513968 sec + 2,206,571,631 cycles # 2.965 GHz + 3,147,975,302 instructions # 1.43 insn per cycle + 0.801145911 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.149781e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.001947 sec - 15,160,921,453 cycles # 3.029 GHz - 38,440,320,018 instructions # 2.54 insn per cycle - 5.007262329 seconds time elapsed +TOTAL : 4.981998 sec + 15,156,593,836 cycles # 3.040 GHz + 38,437,072,823 instructions # 2.54 insn per cycle + 4.987299145 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.537912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729582e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.640780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838553e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.070180 sec - 9,135,564,109 cycles # 2.971 GHz - 24,595,068,911 instructions # 2.69 insn per cycle - 3.075510770 seconds time elapsed +TOTAL : 2.985566 sec + 9,095,215,674 cycles # 3.042 GHz + 24,591,174,592 instructions # 2.70 insn per cycle + 2.991001875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.794659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.298456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.298456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.834785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.339543e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.339543e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.915155 sec - 5,488,800,341 cycles # 2.860 GHz - 11,269,289,809 instructions # 2.05 insn per cycle - 1.920562747 seconds time elapsed +TOTAL : 1.901490 sec + 5,454,837,265 cycles # 2.862 GHz + 11,265,546,477 instructions # 2.07 insn per cycle + 1.907039068 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.465243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.099655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.099655e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.372557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.993390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.993390e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.726047 sec - 4,948,464,581 cycles # 2.859 GHz - 10,575,268,094 instructions # 2.14 insn per cycle - 1.731560491 seconds time elapsed +TOTAL : 1.751887 sec + 4,963,717,675 cycles # 2.826 GHz + 10,572,023,161 instructions # 2.13 insn per cycle + 1.757527600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977744e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.204839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.204839e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.939400e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.168716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.740172 sec - 5,379,659,738 cycles # 1.960 GHz - 7,808,789,832 instructions # 1.45 insn per cycle - 2.745493260 seconds time elapsed +TOTAL : 2.769882 sec + 5,377,512,872 cycles # 1.939 GHz + 7,806,286,911 instructions # 1.45 insn per cycle + 2.775553290 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
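One pattern worth flagging before the _bridge log that follows: with the bridge, the CUDA run reports RMBHST and warns that it "cannot use RamboDevice, will use RamboHost", so phase-space sampling moves to the host and the combined Rmb+ME rate drops by an order of magnitude. Copying the two '+' Rmb+ME values (direct CUDA above, bridge below):

# Numbers copied from the gg_tt d_inl0_hrd0 CUDA sections (direct vs --bridge).
direct_rmb_me = 5.051243e+07  # CUD:DBL ... RMBDEV: Rambo runs on the device
bridge_rmb_me = 4.592700e+06  # CUD:DBL ... RMBHST+BRDDEV: Rambo on the host
print(f'bridge / direct = {bridge_rmb_me / direct_rmb_me:.3f}')  # ~0.091

The MatrixElems rate also drops (from ~1.17e+08 to ~3.01e+07), which I read as the extra host-device traffic the bridge implies, though the logs themselves do not say so.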
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index dd2f256477..8de158cb65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:20 +DATE: 2023-11-08_21:51:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.496633e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.880527e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.880527e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.592700e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.008872e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.008872e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.808083 sec - 3,120,895,454 cycles # 2.971 GHz - 4,726,889,577 instructions # 1.51 insn per cycle - 1.107972527 seconds time elapsed +TOTAL : 0.804684 sec + 3,099,147,756 cycles # 2.967 GHz + 4,823,816,385 instructions # 1.56 insn per cycle + 1.102344703 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.117962e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179706e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.051112e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.111963e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.111963e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.135527 sec - 15,504,544,823 cycles # 3.016 GHz - 38,497,224,440 instructions # 2.48 insn per cycle - 5.142229259 seconds time elapsed +TOTAL : 5.297825 sec + 15,481,852,434 cycles # 2.919 GHz + 38,496,050,546 instructions # 2.49 insn per cycle + 5.304382607 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.790745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.790745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.421539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.610351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.610351e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.098715 sec - 9,432,801,004 cycles # 3.038 GHz - 24,773,895,780 instructions # 2.63 insn per cycle - 3.105439323 seconds time elapsed +TOTAL : 3.252273 sec + 9,439,657,096 cycles # 2.897 GHz + 24,775,783,847 instructions # 2.62 insn per cycle + 3.259008663 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.527781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.981315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.981315e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.465972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.935898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.935898e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.098555 sec - 5,826,323,105 cycles # 2.789 GHz - 11,554,423,664 instructions # 1.98 insn per cycle - 2.105206679 seconds time elapsed +TOTAL : 2.107608 sec + 5,817,196,530 cycles # 2.752 GHz + 11,552,661,145 instructions # 1.99 insn per cycle + 2.114326410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.300396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.893264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.893264e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.009635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.580924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.580924e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.849117 sec - 5,294,307,248 cycles # 2.854 GHz - 10,856,382,305 instructions # 2.05 insn per cycle - 1.855861110 seconds time elapsed +TOTAL : 1.934696 sec + 5,303,416,333 cycles # 2.735 GHz + 10,861,487,391 instructions # 2.05 insn per cycle + 1.941424882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.891057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.111611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.111611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.701730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.912869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912869e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.882235 sec - 5,742,873,090 cycles # 1.988 GHz - 8,048,787,968 instructions # 1.40 insn per cycle - 2.889049440 seconds time elapsed +TOTAL : 3.025583 sec + 5,727,782,590 cycles # 1.894 GHz + 8,052,158,492 instructions # 1.41 insn per cycle + 3.032424174 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70c42f96ca..fc433be1ef 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:21 +DATE: 2023-11-08_22:04:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.579966e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154296e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270387e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.726172e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159376e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270269e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.619804 sec - 2,500,171,473 cycles # 2.947 GHz - 3,610,462,854 instructions # 1.44 insn per cycle - 0.906022247 seconds time elapsed +TOTAL : 0.626000 sec + 2,413,951,090 cycles # 2.822 GHz + 3,508,959,445 instructions # 1.45 insn per cycle + 0.913280230 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204103e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204103e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.061672 sec - 15,345,417,554 cycles # 3.029 GHz - 38,452,483,858 instructions # 2.51 insn per cycle - 5.067127392 seconds time elapsed +TOTAL : 4.967265 sec + 15,332,653,861 cycles # 3.084 GHz + 38,452,810,595 instructions # 2.51 insn per cycle + 4.972510854 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.787517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.787517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.898409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.898409e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.081938 sec - 9,306,122,505 cycles # 3.015 GHz - 24,590,602,612 instructions # 2.64 insn per cycle - 3.087467598 seconds time elapsed +TOTAL : 2.999548 sec + 9,281,583,975 cycles # 3.090 GHz + 24,591,762,393 instructions # 2.65 insn per cycle + 3.004985897 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.780444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.284766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.284766e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.871319e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.385365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385365e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.978919 sec - 5,659,108,727 cycles # 2.853 GHz - 11,248,307,846 instructions # 1.99 insn per cycle - 1.984493875 seconds time elapsed +TOTAL : 1.950157 sec + 5,690,984,261 cycles # 2.911 GHz + 11,247,762,981 instructions # 1.98 insn per cycle + 1.955461495 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.409554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.043503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.043503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.503413e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.137413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.137413e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.801971 sec - 5,131,678,035 cycles # 2.841 GHz - 10,518,217,961 instructions # 2.05 insn per cycle - 1.807387516 seconds time elapsed +TOTAL : 1.776614 sec + 5,148,876,403 cycles # 2.891 GHz + 10,521,901,939 instructions # 2.04 insn per cycle + 1.781976606 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.952294e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.075607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.312212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.312212e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.820832 sec - 5,565,619,645 cycles # 1.970 GHz - 7,754,617,723 instructions # 1.39 insn per cycle - 2.826352548 seconds time elapsed +TOTAL : 2.736817 sec + 5,563,466,882 cycles # 2.030 GHz + 7,754,129,949 instructions # 1.39 insn per cycle + 2.742022793 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 4837b41444..f949e08a8e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:45:01 +DATE: 2023-11-08_22:01:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.583777e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154968e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271096e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.746837e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.161251e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269946e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557101 sec - 2,322,977,037 cycles # 2.953 GHz - 3,599,423,025 instructions # 1.55 insn per cycle - 0.843882316 seconds time elapsed +TOTAL : 0.546588 sec + 2,339,106,527 cycles # 3.024 GHz + 3,639,530,742 instructions # 1.56 insn per cycle + 0.830477401 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.134010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.196717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.196717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.194821e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.259419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.259419e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.022340 sec - 15,161,844,495 cycles # 3.017 GHz - 38,436,020,868 instructions # 2.54 insn per cycle - 5.028057319 seconds time elapsed +TOTAL : 4.881299 sec + 15,162,215,504 cycles # 3.104 GHz + 38,436,564,546 instructions # 2.54 insn per cycle + 4.886593937 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.611425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807723e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807723e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.717533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.921164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.921164e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.009043 sec - 9,092,248,013 cycles # 3.018 GHz - 24,590,993,356 instructions # 2.70 insn per cycle - 3.014816078 seconds time elapsed +TOTAL : 2.924290 sec + 9,098,563,572 cycles # 3.107 GHz + 24,592,229,111 instructions # 2.70 insn per cycle + 2.929612410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.765157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.263695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.263695e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.896966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.423160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.423160e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.924911 sec - 5,492,799,049 cycles # 2.847 GHz - 11,264,994,094 instructions # 2.05 insn per cycle - 1.930399853 seconds time elapsed +TOTAL : 1.883509 sec + 5,473,701,924 cycles # 2.899 GHz + 11,265,098,305 instructions # 2.06 insn per cycle + 1.888826353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.461458e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.086226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.086226e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.333944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.936194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.936194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.728063 sec - 4,951,669,022 cycles # 2.858 GHz - 10,569,075,843 instructions # 2.13 insn per cycle - 1.733593807 seconds time elapsed +TOTAL : 1.759678 sec + 4,959,739,230 cycles # 2.811 GHz + 10,570,009,461 instructions # 2.13 insn per cycle + 1.765083600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.938989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.108089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.344532e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.344532e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.768049 sec - 5,404,539,268 cycles # 1.950 GHz - 7,804,733,779 instructions # 1.44 insn per cycle - 2.773480694 seconds time elapsed +TOTAL : 2.655128 sec + 5,388,561,520 cycles # 2.026 GHz + 7,804,959,196 instructions # 1.45 insn per cycle + 2.660471194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 04f32ac3bc..6c72f6887e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:41:43 +DATE: 2023-11-08_21:58:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
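The '-'/'+' throughput pairs in these regenerated logs differ only at the few-percent level, i.e. within run-to-run jitter on the same node. A hypothetical helper (not part of the repo) for flagging larger-than-noise changes when diffing two tput logs:

    # Relative change between an old (-) and new (+) measurement from the hunk above.
    def rel_delta(old, new):
        """Fractional change of a throughput value between two runs."""
        return (new - old) / old

    old, new = 4.163796e+05, 4.344532e+05  # 512z EvtsPerSec[MatrixElems], - and + lines
    d = rel_delta(old, new)
    print(f"{d:+.2%}")                     # ~ +4.3%, still compatible with jitter
    assert abs(d) < 0.10, "only deltas beyond ~10% would suggest a real change"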
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.845624e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154000e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267501e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.993868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158186e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266776e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.705622 sec - 2,764,377,825 cycles # 2.955 GHz - 4,322,445,800 instructions # 1.56 insn per cycle - 0.992638570 seconds time elapsed +TOTAL : 0.697399 sec + 2,787,983,765 cycles # 3.019 GHz + 4,369,945,413 instructions # 1.57 insn per cycle + 0.982292174 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.118266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.151791e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.213814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213814e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.055003 sec - 15,355,352,228 cycles # 3.035 GHz - 38,436,037,499 instructions # 2.50 insn per cycle - 5.060369145 seconds time elapsed +TOTAL : 4.977336 sec + 15,184,395,969 cycles # 3.048 GHz + 38,438,963,256 instructions # 2.53 insn per cycle + 4.982648512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.814626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.814626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.705404e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.908313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.908313e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.002993 sec - 9,098,824,080 cycles # 3.025 GHz - 24,590,228,698 instructions # 2.70 insn per cycle - 3.008485414 seconds time elapsed +TOTAL : 2.933004 sec + 9,125,855,621 cycles # 3.107 GHz + 24,590,801,711 instructions # 2.69 insn per cycle + 2.938291037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.738465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.252767e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.252767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.720849e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.210353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.210353e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934521 sec - 5,491,674,204 cycles # 2.833 GHz - 11,265,170,941 instructions # 2.05 insn per cycle - 1.939950087 seconds time elapsed +TOTAL : 1.938623 sec + 5,466,827,554 cycles # 2.814 GHz + 11,265,438,862 instructions # 2.06 insn per cycle + 1.943823759 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.341479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.957193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.957193e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.635980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.287954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.287954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759206 sec - 4,958,873,003 cycles # 2.811 GHz - 10,570,272,367 instructions # 2.13 insn per cycle - 1.764825335 seconds time elapsed +TOTAL : 1.682323 sec + 4,955,566,146 cycles # 2.937 GHz + 10,571,524,775 instructions # 2.13 insn per cycle + 1.687724736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.934828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.091835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.326386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.326386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.772888 sec - 5,409,288,056 cycles # 1.948 GHz - 7,806,084,388 instructions # 1.44 insn per cycle - 2.778257755 seconds time elapsed +TOTAL : 2.665788 sec + 5,400,449,096 cycles # 2.023 GHz + 7,805,014,579 instructions # 1.45 insn per cycle + 2.671129758 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
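The 'Internal loops fptype_sv' headers encode the SIMD vector width as register width divided by the size of the floating-point type: the DOUBLE logs above report VECTOR[2]/[4]/[8] for sse4/avx2/512z, and the FLOAT logs further down report VECTOR[4]/[8]/[16]. A one-line check of that arithmetic:

    # SIMD vector width = register bits / (8 * sizeof(fptype)), as in the log headers.
    for simd, bits in [("sse4", 128), ("avx2", 256), ("512z", 512)]:
        for fp, nbytes in [("DOUBLE", 8), ("FLOAT", 4)]:
            print(simd, fp, f"VECTOR[{bits // (8 * nbytes)}]")
    # sse4 DOUBLE VECTOR[2] ... 512z FLOAT VECTOR[16], matching the headers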
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 4e3b221e19..3a0f520dcc 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:57 +DATE: 2023-11-08_21:18:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.258167e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174363e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266024e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.048585e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265645e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515882 sec - 2,147,525,845 cycles # 2.877 GHz - 3,086,933,024 instructions # 1.44 insn per cycle - 0.803849250 seconds time elapsed +TOTAL : 0.515938 sec + 2,194,564,244 cycles # 2.948 GHz + 3,170,767,882 instructions # 1.44 insn per cycle + 0.803319972 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.170531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.145803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.208726e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.208726e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.935479 sec - 15,016,135,362 cycles # 3.040 GHz - 40,166,123,209 instructions # 2.67 insn per cycle - 4.940913654 seconds time elapsed +TOTAL : 4.991935 sec + 15,019,527,641 cycles # 3.006 GHz + 40,165,389,576 instructions # 2.67 insn per cycle + 4.997467241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.815308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.035943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.035943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.795270e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.015877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.015877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.853658 sec - 8,679,305,567 cycles # 3.037 GHz - 23,688,803,932 instructions # 2.73 insn per cycle - 2.859362026 seconds time elapsed +TOTAL : 2.867596 sec + 8,671,075,725 cycles # 3.019 GHz + 23,683,669,849 instructions # 2.73 insn per cycle + 2.873212548 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2069) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.201194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.599502e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599502e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.180539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.583447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.119971 sec - 6,076,924,812 cycles # 2.860 GHz - 13,078,281,182 instructions # 2.15 insn per cycle - 2.125352086 seconds time elapsed +TOTAL : 2.128793 sec + 6,072,650,571 cycles # 2.846 GHz + 13,074,915,373 instructions # 2.15 insn per cycle + 2.134316674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) 
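The perf-style counters quoted in each block are internally consistent: 'insn per cycle' is instructions/cycles, and the quoted GHz is cycles per second of CPU time (perf normalises by task-clock, so wall-clock gives a close approximation). Checking the avx2 hrd1 '+' values just above:

    # Consistency check of the perf counters (values copied from the '+' lines above).
    cycles       = 6_072_650_571
    instructions = 13_074_915_373
    elapsed      = 2.134316674                            # seconds time elapsed
    print(f"{instructions / cycles:.2f} insn per cycle")  # 2.15, as printed
    print(f"~{cycles / elapsed / 1e9:.3f} GHz")           # ~2.845 vs the printed 2.846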
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.478450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.920522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.920522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.449593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.890564e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.890564e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.017570 sec - 5,787,274,892 cycles # 2.862 GHz - 12,336,105,279 instructions # 2.13 insn per cycle - 2.023012261 seconds time elapsed +TOTAL : 2.028925 sec + 5,794,294,617 cycles # 2.851 GHz + 12,335,132,296 instructions # 2.13 insn per cycle + 2.034385767 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519779e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701184e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.645486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838740e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.086221 sec - 5,817,765,621 cycles # 1.888 GHz - 9,621,068,231 instructions # 1.65 insn per cycle - 3.091564620 seconds time elapsed +TOTAL : 2.982084 sec + 5,814,493,383 cycles # 1.947 GHz + 9,613,724,456 instructions # 1.65 insn per cycle + 2.987600867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1510) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 3337c01ad4..1cbf67a236 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:16 +DATE: 2023-11-08_21:41:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554755e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155174e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268743e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.595048e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160670e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269203e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526687 sec - 2,250,994,801 cycles # 2.926 GHz - 3,097,737,524 instructions # 1.38 insn per cycle - 0.826717654 seconds time elapsed +TOTAL : 0.521954 sec + 2,216,810,301 cycles # 2.935 GHz + 3,140,499,783 instructions # 1.42 insn per cycle + 0.812101303 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.473532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556761e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.505174e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.591402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.591402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.344783 sec - 13,019,193,404 cycles # 2.993 GHz - 34,405,663,599 instructions # 2.64 insn per cycle - 4.350365607 seconds time elapsed +TOTAL : 4.291146 sec + 13,017,199,090 cycles # 3.030 GHz + 34,406,598,887 instructions # 2.64 insn per cycle + 4.296733375 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.106755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.249963e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.249963e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.483866 sec - 10,607,531,951 cycles # 3.041 GHz - 24,022,392,993 instructions # 2.26 insn per cycle - 3.489298956 seconds time elapsed +TOTAL : 3.481603 sec + 10,608,834,284 cycles # 3.044 GHz + 24,023,421,035 instructions # 2.26 insn per cycle + 3.487384559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.787875e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.756679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.089717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.295291 sec - 6,588,895,934 cycles # 2.865 GHz - 12,413,954,044 instructions # 1.88 insn per cycle - 2.300926049 seconds time elapsed +TOTAL : 2.309669 sec + 6,605,241,660 cycles # 2.854 GHz + 12,414,642,119 instructions # 1.88 insn per cycle + 2.315374830 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.072251e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.445053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.445053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.883072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.243446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.243446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.171777 sec - 6,238,931,665 cycles # 2.866 GHz - 11,585,660,605 instructions # 1.86 insn per cycle - 2.177410338 seconds time elapsed +TOTAL : 2.253913 sec + 6,256,146,881 cycles # 2.770 GHz + 11,588,754,266 instructions # 1.85 insn per cycle + 2.259602028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.229600e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.229600e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.014282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.246391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.246391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.727363 sec - 5,337,713,756 cycles # 1.954 GHz - 9,308,309,205 instructions # 1.74 insn per cycle - 2.732896997 seconds time elapsed +TOTAL : 2.718420 sec + 5,340,176,505 cycles # 1.961 GHz + 9,309,276,244 instructions # 1.74 insn per cycle + 2.724177871 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) 
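Each block is keyed by a build directory of the form build.<simd>_<d|f>_inl<0|1>_hrd<0|1>, where d/f selects the DOUBLE/FLOAT logs and inl/hrd mirror the [inlineHel=...] [hardcodePARAM=...] banner. A hypothetical tag parser (not part of the repo) for grouping results when post-processing these logs:

    import re

    # Hypothetical parser for build tags such as 'build.512y_d_inl1_hrd0'.
    TAG = re.compile(r"build\.(?P<simd>\w+?)_(?P<fp>[df])_inl(?P<inl>[01])_hrd(?P<hrd>[01])")

    m = TAG.search("build.512y_d_inl1_hrd0/runTest.exe")
    print(m.groupdict())   # {'simd': '512y', 'fp': 'd', 'inl': '1', 'hrd': '0'}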
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 64e33308d5..086ff92179 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:43 +DATE: 2023-11-08_21:42:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571117e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157677e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270835e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.601958e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157408e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268312e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523342 sec - 2,241,527,426 cycles # 2.944 GHz - 3,209,964,665 instructions # 1.43 insn per cycle - 0.819917937 seconds time elapsed +TOTAL : 0.523179 sec + 2,197,044,574 cycles # 2.904 GHz + 3,180,010,549 instructions # 1.45 insn per cycle + 0.813333970 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.658099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.754988e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.754988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.643503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.049112 sec - 12,374,606,485 cycles # 3.053 GHz - 35,058,016,337 instructions # 2.83 insn per cycle - 4.054549094 seconds time elapsed +TOTAL : 4.216286 sec + 12,375,189,012 cycles # 2.932 GHz + 35,060,083,206 instructions # 2.83 insn per cycle + 4.222169031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.088523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231607e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.067813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209694e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.500477 sec - 10,694,410,777 cycles # 3.051 GHz - 23,099,336,289 instructions # 2.16 insn per cycle - 3.506159729 seconds time elapsed +TOTAL : 3.525507 sec + 10,698,056,208 cycles # 3.031 GHz + 23,100,081,560 instructions # 2.16 insn per cycle + 3.531306963 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.105721e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.492220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.492220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.118146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.507530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.507530e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.158641 sec - 6,163,495,994 cycles # 2.849 GHz - 11,969,488,967 instructions # 1.94 insn per cycle - 2.164367762 seconds time elapsed +TOTAL : 2.154521 sec + 6,166,402,806 cycles # 2.856 GHz + 11,969,983,926 instructions # 1.94 insn per cycle + 2.160177772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) 
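The '=Symbols in CPPProcess.o=' lines count how many instructions of each SIMD flavour ended up in the object file. A rough, hypothetical approximation of the idea (the real classification is finer grained, e.g. splitting '512y' from 'avx2' by mnemonic rather than just by register width):

    import re, subprocess

    # Crude proxy: count 256-bit (ymm) and 512-bit (zmm) register references
    # in the disassembly of the object file.
    asm = subprocess.run(["objdump", "-d", "CPPProcess.o"],
                         capture_output=True, text=True).stdout
    print("ymm refs:", len(re.findall(r"%ymm", asm)),
          "zmm refs:", len(re.findall(r"%zmm", asm)))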
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.169198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.238236e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.649069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.649069e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.133549 sec - 6,039,094,179 cycles # 2.824 GHz - 11,144,077,781 instructions # 1.85 insn per cycle - 2.139096234 seconds time elapsed +TOTAL : 2.108281 sec + 6,026,300,401 cycles # 2.854 GHz + 11,141,738,024 instructions # 1.85 insn per cycle + 2.114031870 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.003701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233597e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.978977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.208595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.208595e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726476 sec - 5,224,063,612 cycles # 1.913 GHz - 9,034,702,359 instructions # 1.73 insn per cycle - 2.732050023 seconds time elapsed +TOTAL : 2.742076 sec + 5,240,960,370 cycles # 1.908 GHz + 9,033,887,762 instructions # 1.72 insn per cycle + 2.747795404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 8d92c550fe..eb4d5419ee 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:04:25 +DATE: 2023-11-08_21:19:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.099342e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.699387e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953526e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.037656e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.679710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950060e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.471293 sec - 2,042,101,644 cycles # 2.948 GHz - 2,946,816,826 instructions # 1.44 insn per cycle - 0.749881107 seconds time elapsed +TOTAL : 0.474624 sec + 2,093,800,407 cycles # 2.948 GHz + 2,971,543,250 instructions # 1.42 insn per cycle + 0.767958808 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.296642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371475e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.294584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.370694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.370694e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.650028 sec - 14,160,157,406 cycles # 3.043 GHz - 38,398,040,352 instructions # 2.71 insn per cycle - 4.655270250 seconds time elapsed +TOTAL : 4.654140 sec + 14,153,083,054 cycles # 3.038 GHz + 38,392,852,878 instructions # 2.71 insn per cycle + 4.659227784 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.139917e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.562152e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.562152e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.142013e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564188e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124632 sec - 6,476,959,128 cycles # 3.042 GHz - 15,834,256,517 instructions # 2.44 insn per cycle - 2.129768462 seconds time elapsed +TOTAL : 2.123842 sec + 6,471,678,330 cycles # 3.041 GHz + 15,829,749,383 instructions # 2.45 insn per cycle + 2.129132115 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.088663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.403745e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082517e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.237397 sec - 3,465,504,689 cycles # 2.794 GHz - 7,611,207,779 instructions # 2.20 insn per cycle - 1.242588855 seconds time elapsed +TOTAL : 1.198427 sec + 3,459,269,129 cycles # 2.876 GHz + 7,606,844,485 instructions # 2.20 insn per cycle + 1.203597878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.457008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.005658e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168806e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168806e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.196326 sec - 3,247,822,045 cycles # 2.704 GHz - 7,220,309,293 instructions # 2.22 insn per cycle - 1.201704693 seconds time elapsed +TOTAL : 1.126360 sec + 3,254,355,778 cycles # 2.878 GHz + 7,215,715,994 instructions # 2.22 insn per cycle + 1.131662200 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.679715e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.389169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.389169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.276060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.101034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.101034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.658315 sec - 3,062,288,257 cycles # 1.842 GHz - 5,850,668,317 instructions # 1.91 insn per cycle - 1.663822965 seconds time elapsed +TOTAL : 1.528725 sec + 3,068,447,705 cycles # 2.001 GHz + 5,846,027,778 instructions # 1.91 insn per cycle + 1.534029615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
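Comparing this FLOAT log with a DOUBLE avx2 block near the top of this section (~6.4e5 MEs/sec) shows the expected effect of doubling the SIMD width, roughly a 1.7x speedup rather than the ideal 2x; the two numbers come from different RNG-mode variants of the same gg_tt build, so this is indicative only:

    # FLOAT vs DOUBLE avx2 throughput (values copied from '+' lines in this section).
    flt, dbl = 1.082517e+06, 6.385365e+05  # EvtsPerSec[MatrixElems], f vs d
    print(f"float/double speedup: {flt / dbl:.2f}x")  # ~1.70x for twice the width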
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index a1ebef89d2..459315b5db 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:48 +DATE: 2023-11-08_21:52:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.064201e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.498245e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.498245e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.229057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.759945e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.759945e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.670260 sec - 2,637,877,021 cycles # 2.942 GHz - 4,088,256,570 instructions # 1.55 insn per cycle - 0.955124097 seconds time elapsed +TOTAL : 0.663839 sec + 2,633,797,388 cycles # 2.963 GHz + 4,071,573,226 instructions # 1.55 insn per cycle + 0.947283739 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.270912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.344925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.280486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353996e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.744982 sec - 14,378,860,027 cycles # 3.027 GHz - 38,435,472,086 instructions # 2.67 insn per cycle - 4.751370421 seconds time elapsed +TOTAL : 4.724775 sec + 14,342,143,211 cycles # 3.033 GHz + 38,438,250,053 instructions # 2.68 insn per cycle + 4.731136861 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.017460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.072115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.484269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.484269e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.222354 sec - 6,685,137,863 cycles # 3.001 GHz - 16,109,819,565 instructions # 2.41 insn per cycle - 2.228696460 seconds time elapsed +TOTAL : 2.197377 sec + 6,673,460,854 cycles # 3.029 GHz + 16,110,044,412 instructions # 2.41 insn per cycle + 2.203637127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.204872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.057185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057185e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.156025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050843e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050843e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.267912 sec - 3,665,496,802 cycles # 2.878 GHz - 7,843,464,752 instructions # 2.14 insn per cycle - 1.274414413 seconds time elapsed +TOTAL : 1.276703 sec + 3,679,224,682 cycles # 2.872 GHz + 7,844,733,298 instructions # 2.13 insn per cycle + 1.282950304 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.639653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.116975e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.116975e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.848037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.141843e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.141843e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.220373 sec - 3,444,640,052 cycles # 2.810 GHz - 7,451,522,975 instructions # 2.16 insn per cycle - 1.226715796 seconds time elapsed +TOTAL : 1.194194 sec + 3,452,479,238 cycles # 2.878 GHz + 7,452,050,539 instructions # 2.16 insn per cycle + 1.200346156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.178040e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.972638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.972638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.221197e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.012402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.012402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.593853 sec - 3,283,201,976 cycles # 2.053 GHz - 6,099,788,393 instructions # 1.86 insn per cycle - 1.600161746 seconds time elapsed +TOTAL : 1.583142 sec + 3,273,382,507 cycles # 2.061 GHz + 6,100,795,667 instructions # 1.86 insn per cycle + 1.589319377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index b7fb0d6959..dcdda81950 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:49 +DATE: 2023-11-08_22:05:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.431152e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624289e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.946132e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.826188e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.648877e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951378e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.564134 sec - 2,302,613,621 cycles # 2.942 GHz - 3,377,451,746 instructions # 1.47 insn per cycle - 0.841499880 seconds time elapsed +TOTAL : 0.557947 sec + 2,332,705,336 cycles # 3.000 GHz + 3,420,801,676 instructions # 1.47 insn per cycle + 0.836912289 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.289992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.416548e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.416548e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.718949 sec - 14,318,249,819 cycles # 3.032 GHz - 38,421,429,911 instructions # 2.68 insn per cycle - 4.724102129 seconds time elapsed +TOTAL : 4.618720 sec + 14,313,897,069 cycles # 3.097 GHz + 38,421,663,028 instructions # 2.68 insn per cycle + 4.623775275 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.232630e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.661001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.661001e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.204441 sec - 6,639,814,735 cycles # 3.006 GHz - 15,841,902,427 instructions # 2.39 insn per cycle - 2.209539727 seconds time elapsed +TOTAL : 2.140530 sec + 6,636,885,571 cycles # 3.094 GHz + 15,842,171,589 instructions # 2.39 insn per cycle + 2.145594820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.307822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.070999e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.070999e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.545031e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.097804e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.097804e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0
-TOTAL : 1.265035 sec
- 3,649,285,785 cycles # 2.875 GHz
- 7,591,137,573 instructions # 2.08 insn per cycle
- 1.270319196 seconds time elapsed
+TOTAL : 1.233588 sec
+ 3,635,079,459 cycles # 2.936 GHz
+ 7,590,685,166 instructions # 2.09 insn per cycle
+ 1.238746125 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.974832e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.160037e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.160037e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024875e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.195413e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.195413e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0
-TOTAL : 1.191816 sec
- 3,426,519,284 cycles # 2.864 GHz
- 7,166,067,248 instructions # 2.09 insn per cycle
- 1.197132868 seconds time elapsed
+TOTAL : 1.160670 sec
+ 3,429,453,475 cycles # 2.944 GHz
+ 7,166,679,947 instructions # 2.09 insn per cycle
+ 1.165684786 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.265683e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.068951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.068951e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.262300e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.049639e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.049639e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.584018 sec
- 3,241,188,093 cycles # 2.041 GHz
- 5,795,628,367 instructions # 1.79 insn per cycle
- 1.589192883 seconds time elapsed
+TOTAL : 1.582365 sec
+ 3,235,924,413 cycles # 2.039 GHz
+ 5,796,611,749 instructions # 1.79 insn per cycle
+ 1.587507042 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
index 30f4fadf92..831fd0fa9f 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:45:28
+DATE: 2023-11-08_22:01:51

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.447666e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.634082e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.951326e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.837632e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.654775e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.958238e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.513708 sec
- 2,149,338,807 cycles # 2.936 GHz
- 3,363,855,189 instructions # 1.57 insn per cycle
- 0.790810409 seconds time elapsed
+TOTAL : 0.503341 sec
+ 2,173,332,424 cycles # 3.019 GHz
+ 3,385,289,251 instructions # 1.56 insn per cycle
+ 0.779359289 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.247364e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.319306e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319306e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.329232e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.405368e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.405368e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.751713 sec
- 14,161,394,696 cycles # 2.978 GHz
- 38,393,782,229 instructions # 2.71 insn per cycle
- 4.756965371 seconds time elapsed
+TOTAL : 4.586570 sec
+ 14,159,897,717 cycles # 3.085 GHz
+ 38,395,355,740 instructions # 2.71 insn per cycle
+ 4.591702989 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.102956e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.519127e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.519127e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.170239e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.592491e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.592491e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.140784 sec
- 6,476,072,518 cycles # 3.019 GHz
- 15,828,662,766 instructions # 2.44 insn per cycle
- 2.146087935 seconds time elapsed
+TOTAL : 2.112173 sec
+ 6,472,075,786 cycles # 3.058 GHz
+ 15,829,638,315 instructions # 2.45 insn per cycle
+ 2.117221818 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.357298e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.077430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.077430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.605537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.104706e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.104706e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.205006 sec
- 3,468,184,099 cycles # 2.868 GHz
- 7,606,030,531 instructions # 2.19 insn per cycle
- 1.210138102 seconds time elapsed
+TOTAL : 1.174316 sec
+ 3,462,364,333 cycles # 2.937 GHz
+ 7,606,467,395 instructions # 2.20 insn per cycle
+ 1.179522425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.559739e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.106426e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.106426e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024286e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.190805e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.190805e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.182909 sec
- 3,252,386,286 cycles # 2.739 GHz
- 7,215,128,616 instructions # 2.22 insn per cycle
- 1.188234183 seconds time elapsed
+TOTAL : 1.105549 sec
+ 3,254,375,411 cycles # 2.932 GHz
+ 7,215,571,393 instructions # 2.22 insn per cycle
+ 1.110519445 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.332938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.163555e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.163555e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.518662e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.361331e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.361331e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.514986 sec
- 3,076,222,583 cycles # 2.024 GHz
- 5,845,646,643 instructions # 1.90 insn per cycle
- 1.520503790 seconds time elapsed
+TOTAL : 1.478873 sec
+ 3,068,230,484 cycles # 2.069 GHz
+ 5,846,211,473 instructions # 1.91 insn per cycle
+ 1.484040601 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 65eed836f1..bb838a2196 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:42:11
+DATE: 2023-11-08_21:58:39

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.910755e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.623741e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.938668e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.130902e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.643491e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.939128e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
-TOTAL : 0.613295 sec
- 2,456,965,302 cycles # 2.952 GHz
- 3,803,211,416 instructions # 1.55 insn per cycle
- 0.890835389 seconds time elapsed
+TOTAL : 0.604908 sec
+ 2,484,417,262 cycles # 3.021 GHz
+ 3,852,149,899 instructions # 1.55 insn per cycle
+ 0.881326202 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.291712e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.365790e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.365790e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.328989e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.404078e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.404078e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.660750 sec
- 14,151,818,953 cycles # 3.034 GHz
- 38,392,284,342 instructions # 2.71 insn per cycle
- 4.665929439 seconds time elapsed
+TOTAL : 4.586292 sec
+ 14,210,336,618 cycles # 3.096 GHz
+ 38,392,847,533 instructions # 2.70 insn per cycle
+ 4.591549142 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
@@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.100691e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.531126e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.531126e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.239674e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.668279e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.668279e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.141262 sec
- 6,484,613,456 cycles # 3.022 GHz
- 15,829,197,800 instructions # 2.44 insn per cycle
- 2.146554392 seconds time elapsed
+TOTAL : 2.084661 sec
+ 6,470,762,281 cycles # 3.098 GHz
+ 15,829,570,536 instructions # 2.45 insn per cycle
+ 2.089664033 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
@@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.341999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.073892e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073892e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.589227e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.103396e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.103396e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.207094 sec
- 3,469,517,910 cycles # 2.864 GHz
- 7,605,958,162 instructions # 2.19 insn per cycle
- 1.212334488 seconds time elapsed
+TOTAL : 1.175545 sec
+ 3,466,544,418 cycles # 2.938 GHz
+ 7,606,584,140 instructions # 2.19 insn per cycle
+ 1.180575347 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.000164e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.163047e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.163047e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024662e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.193480e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.193480e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.132933 sec
- 3,264,238,503 cycles # 2.869 GHz
- 7,214,964,009 instructions # 2.21 insn per cycle
- 1.138315941 seconds time elapsed
+TOTAL : 1.105660 sec
+ 3,258,740,690 cycles # 2.936 GHz
+ 7,215,101,525 instructions # 2.21 insn per cycle
+ 1.110765672 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.339791e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.166023e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.166023e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.584208e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.436586e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.436586e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.514355 sec
- 3,071,490,694 cycles # 2.022 GHz
- 5,845,279,944 instructions # 1.90 insn per cycle
- 1.519539150 seconds time elapsed
+TOTAL : 1.465958 sec
+ 3,064,168,908 cycles # 2.084 GHz
+ 5,845,466,179 instructions # 1.91 insn per cycle
+ 1.471139277 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 06d8f7d09d..d667b6dbf4 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:04:48
+DATE: 2023-11-08_21:19:44

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.108032e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.751852e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017010e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.049999e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.742417e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.025106e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.473084 sec
- 2,025,626,323 cycles # 2.920 GHz
- 2,923,341,053 instructions # 1.44 insn per cycle
- 0.752440698 seconds time elapsed
+TOTAL : 0.475958 sec
+ 2,061,164,716 cycles # 2.907 GHz
+ 2,917,299,650 instructions # 1.42 insn per cycle
+ 0.766837667 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.226197e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.296658e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.296658e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.217835e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.287538e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.287538e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.795639 sec
- 14,422,319,778 cycles # 3.005 GHz
- 39,889,404,210 instructions # 2.77 insn per cycle
- 4.800761254 seconds time elapsed
+TOTAL : 4.813699 sec
+ 14,428,562,676 cycles # 2.998 GHz
+ 39,888,508,384 instructions # 2.76 insn per cycle
+ 4.818824247 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.840353e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.410043e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.410043e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.957468e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.536679e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.536679e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.880181 sec
- 5,610,891,745 cycles # 2.978 GHz
- 15,305,908,167 instructions # 2.73 insn per cycle
- 1.885354787 seconds time elapsed
+TOTAL : 1.845039 sec
+ 5,590,599,138 cycles # 3.023 GHz
+ 15,299,534,426 instructions # 2.74 insn per cycle
+ 1.850198462 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.584020e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.270908e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.270908e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.651061e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.332537e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.332537e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.679496 sec
- 4,739,407,479 cycles # 2.814 GHz
- 9,752,382,085 instructions # 2.06 insn per cycle
- 1.685063058 seconds time elapsed
+TOTAL : 1.660892 sec
+ 4,740,556,619 cycles # 2.846 GHz
+ 9,747,822,441 instructions # 2.06 insn per cycle
+ 1.666191221 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3710) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.785300e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.495008e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.495008e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.778515e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.494686e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.494686e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.630325 sec
- 4,628,420,386 cycles # 2.831 GHz
- 9,343,264,044 instructions # 2.02 insn per cycle
- 1.635531127 seconds time elapsed
+TOTAL : 1.631450 sec
+ 4,628,439,590 cycles # 2.829 GHz
+ 9,339,816,116 instructions # 2.02 insn per cycle
+ 1.636603727 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.035393e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.577354e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.577354e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.981004e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.517698e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.517698e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.821625 sec
- 3,652,061,133 cycles # 2.000 GHz
- 7,049,331,376 instructions # 1.93 insn per cycle
- 1.826875192 seconds time elapsed
+TOTAL : 1.837853 sec
+ 3,663,588,168 cycles # 1.989 GHz
+ 7,045,799,249 instructions # 1.92 insn per cycle
+ 1.843187351 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2606) (512y: 12) (512z: 2221)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index 430bbd2c8e..e94beeddac 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:26:11
+DATE: 2023-11-08_21:42:28

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.386931e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.620878e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.939459e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.362873e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.640443e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.957691e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478570 sec
- 2,066,322,031 cycles # 2.937 GHz
- 2,939,169,205 instructions # 1.42 insn per cycle
- 0.760998289 seconds time elapsed
+TOTAL : 0.478743 sec
+ 2,066,773,251 cycles # 2.940 GHz
+ 2,882,191,672 instructions # 1.39 insn per cycle
+ 0.760603829 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.585659e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.679951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.679951e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.571240e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.665961e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.665961e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.141107 sec
- 12,606,870,018 cycles # 3.041 GHz
- 34,392,677,682 instructions # 2.73 insn per cycle
- 4.146310630 seconds time elapsed
+TOTAL : 4.163303 sec
+ 12,605,463,394 cycles # 3.025 GHz
+ 34,393,608,512 instructions # 2.73 insn per cycle
+ 4.168641817 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.476247e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.957210e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.957210e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.401759e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.886488e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.886488e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.000613 sec
- 6,098,731,252 cycles # 3.041 GHz
- 14,873,462,613 instructions # 2.44 insn per cycle
- 2.006051106 seconds time elapsed
+TOTAL : 2.027469 sec
+ 6,100,742,722 cycles # 3.002 GHz
+ 14,874,619,740 instructions # 2.44 insn per cycle
+ 2.032997684 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.182448e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.992665e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.992665e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.152588e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.984648e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.984648e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.544245 sec
- 4,326,302,580 cycles # 2.793 GHz
- 9,041,454,033 instructions # 2.09 insn per cycle
- 1.549495391 seconds time elapsed
+TOTAL : 1.570348 sec
+ 4,280,521,919 cycles # 2.743 GHz
+ 9,042,316,644 instructions # 2.11 insn per cycle
+ 1.575934676 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.602793e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.504278e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.504278e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.548985e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.445828e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.445828e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.462983 sec
- 4,209,847,303 cycles # 2.868 GHz
- 8,675,528,842 instructions # 2.06 insn per cycle
- 1.468300337 seconds time elapsed
+TOTAL : 1.472831 sec
+ 4,206,089,473 cycles # 2.847 GHz
+ 8,677,889,358 instructions # 2.06 insn per cycle
+ 1.478375348 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.697162e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.177263e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.177263e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.660562e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.137441e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.137441e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.925379 sec
- 3,842,178,645 cycles # 1.991 GHz
- 7,819,452,293 instructions # 2.04 insn per cycle
- 1.930845155 seconds time elapsed
+TOTAL : 1.938115 sec
+ 3,846,715,012 cycles # 1.980 GHz
+ 7,820,097,651 instructions # 2.03 insn per cycle
+ 1.943482590 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index c32244c33c..a8a81cca05 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:26:34
+DATE: 2023-11-08_21:42:52

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.460575e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.684792e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.012555e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.468219e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.688670e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.018561e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478960 sec
- 2,073,686,428 cycles # 2.952 GHz
- 2,982,309,893 instructions # 1.44 insn per cycle
- 0.760465246 seconds time elapsed
+TOTAL : 0.479145 sec
+ 2,060,928,745 cycles # 2.937 GHz
+ 2,943,965,642 instructions # 1.43 insn per cycle
+ 0.760902085 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.768420e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.879887e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.879887e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.752408e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.860428e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.860428e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 3.874951 sec
- 11,759,850,982 cycles # 3.031 GHz
- 35,129,174,459 instructions # 2.99 insn per cycle
- 3.880406297 seconds time elapsed
+TOTAL : 3.895863 sec
+ 11,764,358,308 cycles # 3.017 GHz
+ 35,130,105,613 instructions # 2.99 insn per cycle
+ 3.901121829 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.548911e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.058975e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.058975e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.491671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.980976e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.980976e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.977553 sec
- 5,960,287,184 cycles # 3.008 GHz
- 14,484,169,544 instructions # 2.43 insn per cycle
- 1.983134337 seconds time elapsed
+TOTAL : 1.995272 sec
+ 5,963,721,442 cycles # 2.982 GHz
+ 14,483,479,258 instructions # 2.43 insn per cycle
+ 2.000909308 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.662372e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.600563e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.600563e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.606859e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.529662e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.529662e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.451994 sec
- 4,186,509,528 cycles # 2.874 GHz
- 8,887,826,504 instructions # 2.12 insn per cycle
- 1.457581768 seconds time elapsed
+TOTAL : 1.463863 sec
+ 4,171,268,875 cycles # 2.840 GHz
+ 8,887,248,415 instructions # 2.13 insn per cycle
+ 1.469508622 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3576) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.782199e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.721549e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.721549e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.334017e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.185528e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.185528e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.432127 sec
- 4,128,776,992 cycles # 2.874 GHz
- 8,424,271,434 instructions # 2.04 insn per cycle
- 1.437420732 seconds time elapsed
+TOTAL : 1.515911 sec
+ 4,141,896,373 cycles # 2.724 GHz
+ 8,425,434,947 instructions # 2.03 insn per cycle
+ 1.521361653 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.779314e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.273574e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.273574e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.735035e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.250427e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.250427e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.899022 sec
- 3,798,792,191 cycles # 1.996 GHz
- 7,712,429,012 instructions # 2.03 insn per cycle
- 1.904382082 seconds time elapsed
+TOTAL : 1.913707 sec
+ 3,815,274,575 cycles # 1.989 GHz
+ 7,713,047,642 instructions # 2.02 insn per cycle
+ 1.919181973 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 4284e04c80..1d637e1269 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:05:13
+DATE: 2023-11-08_21:20:08

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.262595e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.173145e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.266137e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.064819e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.168761e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.265943e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.516288 sec
- 2,170,206,194 cycles # 2.914 GHz
- 3,121,753,700 instructions # 1.44 insn per cycle
- 0.802206987 seconds time elapsed
+TOTAL : 0.516845 sec
+ 2,194,660,841 cycles # 2.941 GHz
+ 3,161,612,621 instructions # 1.44 insn per cycle
+ 0.804942538 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.129811e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.193121e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.193121e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.076007e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.135159e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.135159e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.027744 sec
- 15,293,663,581 cycles # 3.040 GHz
- 38,642,438,156 instructions # 2.53 insn per cycle
- 5.032856601 seconds time elapsed
+TOTAL : 5.157074 sec
+ 15,456,785,340 cycles # 2.995 GHz
+ 38,638,875,955 instructions # 2.50 insn per cycle
+ 5.162652658 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.666972e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.869148e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.869148e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.689929e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.902707e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.902707e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.964411 sec
- 8,933,093,188 cycles # 3.009 GHz
- 24,243,353,502 instructions # 2.71 insn per cycle
- 2.969821465 seconds time elapsed
+TOTAL : 2.947066 sec
+ 8,960,192,906 cycles # 3.035 GHz
+ 24,239,204,206 instructions # 2.71 insn per cycle
+ 2.952599117 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.660709e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.167400e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.167400e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.870612e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.391820e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.391820e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.961588 sec
- 5,410,079,541 cycles # 2.752 GHz
- 11,291,080,205 instructions # 2.09 insn per cycle
- 1.966921243 seconds time elapsed
+TOTAL : 1.891319 sec
+ 5,424,929,342 cycles # 2.862 GHz
+ 11,287,630,140 instructions # 2.08 insn per cycle
+ 1.896741262 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.588007e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.231756e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.231756e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.626799e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.289896e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.289896e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.695283 sec
- 4,860,759,917 cycles # 2.859 GHz
- 10,541,284,808 instructions # 2.17 insn per cycle
- 1.700590360 seconds time elapsed
+TOTAL : 1.686295 sec
+ 4,842,859,663 cycles # 2.864 GHz
+ 10,535,885,470 instructions # 2.18 insn per cycle
+ 1.691658185 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.107588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.350535e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.350535e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.120532e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.365927e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.365927e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.656629 sec
- 5,204,386,075 cycles # 1.956 GHz
- 7,617,502,706 instructions # 1.46 insn per cycle
- 2.661905103 seconds time elapsed
+TOTAL : 2.650947 sec
+ 5,210,620,634 cycles # 1.962 GHz
+ 7,614,639,902 instructions # 1.46 insn per cycle
+ 2.656437650 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 58d2d743b0..92e3c9f0b5 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:05:40
+DATE: 2023-11-08_21:20:35

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.265506e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.176728e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.270375e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.066522e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173508e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.273022e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.513169 sec
- 2,175,922,923 cycles # 2.936 GHz
- 3,154,957,492 instructions # 1.45 insn per cycle
- 0.799013980 seconds time elapsed
+TOTAL : 0.512769 sec
+ 2,197,876,209 cycles # 2.961 GHz
+ 3,170,940,757 instructions # 1.44 insn per cycle
+ 0.799563998 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.110999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.171227e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.171227e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.111886e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.172848e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.172848e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.072155 sec
- 15,377,556,110 cycles # 3.029 GHz
- 40,435,905,161 instructions # 2.63 insn per cycle
- 5.077406066 seconds time elapsed
+TOTAL : 5.069953 sec
+ 15,385,884,321 cycles # 3.032 GHz
+ 40,433,272,287 instructions # 2.63 insn per cycle
+ 5.075349465 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.761885e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.974310e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.974310e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.654822e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.859127e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.859127e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.891901 sec
- 8,516,736,770 cycles # 2.941 GHz
- 23,273,421,536 instructions # 2.73 insn per cycle
- 2.897134410 seconds time elapsed
+TOTAL : 2.975229 sec
+ 8,506,893,399 cycles # 2.855 GHz
+ 23,270,886,855 instructions # 2.74 insn per cycle
+ 2.980696937 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.041812e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.416387e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.416387e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.053911e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.431363e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.431363e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.184891 sec
- 6,239,964,038 cycles # 2.850 GHz
- 12,976,938,369 instructions # 2.08 insn per cycle
- 2.190210603 seconds time elapsed
+TOTAL : 2.179721 sec
+ 6,241,572,834 cycles # 2.857 GHz
+ 12,973,482,438 instructions # 2.08 insn per cycle
+ 2.185137091 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.262419e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.673980e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.673980e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.331614e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.744905e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.744905e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.097286 sec
- 5,931,604,060 cycles # 2.822 GHz
- 12,254,844,972 instructions # 2.07 insn per cycle
- 2.102596228 seconds time elapsed
+TOTAL : 2.072194 sec
+ 5,929,542,555 cycles # 2.855 GHz
+ 12,251,825,862 instructions # 2.07 insn per cycle
+ 2.077717224 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.636806e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.830983e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.830983e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.800727e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.013912e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.013912e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.989274 sec
- 5,599,763,733 cycles # 1.871 GHz
- 8,758,209,944 instructions # 1.56 insn per cycle
- 2.994808333 seconds time elapsed
+TOTAL : 2.863923 sec
+ 5,611,513,288 cycles # 1.956 GHz
+ 8,753,901,381 instructions # 1.56 insn per cycle
+ 2.869313331 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index c973ded005..87df63c965 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2023-11-03_19:06:08
+DATE: 2023-11-08_21:21:03

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.987778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059978e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.879738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041736e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055795e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462314 sec - 1,969,733,176 cycles # 2.915 GHz - 2,854,417,454 instructions # 1.45 insn per cycle - 0.732902295 seconds time elapsed +TOTAL : 0.461849 sec + 1,973,375,466 cycles # 2.915 GHz + 2,850,187,396 instructions # 1.44 insn per cycle + 0.733799311 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125374e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329149e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114902e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320626e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332328e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595579 sec - 2,446,683,532 cycles # 2.952 GHz - 3,726,903,800 instructions # 1.52 insn per cycle - 0.888429467 seconds time elapsed +TOTAL : 0.597626 sec + 2,460,714,562 cycles # 2.956 GHz + 3,716,258,767 instructions # 1.51 insn per cycle + 0.892242937 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.543975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556543e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.549613e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.549613e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.463148 sec - 19,697,684,289 cycles # 3.046 GHz - 59,611,728,869 instructions # 3.03 insn per cycle - 6.467313414 seconds time elapsed +TOTAL : 6.480284 sec + 19,731,245,814 cycles # 3.044 GHz + 59,610,628,892 instructions # 3.02 insn per cycle + 6.484553626 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.806236e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.850408e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.850408e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.819525e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.864015e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.864015e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.430883 sec - 10,361,092,942 cycles # 3.017 GHz - 30,679,655,225 instructions # 2.96 insn per cycle - 3.435128458 seconds time elapsed +TOTAL : 3.421528 sec + 10,361,656,121 cycles # 3.025 GHz + 30,678,833,436 instructions # 2.96 insn per cycle + 3.425797412 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.723128e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.902993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.902993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.328413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.498915e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.498915e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.707466 sec - 4,879,146,362 cycles # 2.851 GHz - 11,021,709,924 instructions # 2.26 insn per cycle - 1.711937944 seconds time elapsed +TOTAL : 1.779184 sec + 4,885,070,909 cycles # 2.740 GHz + 11,021,940,228 instructions # 2.26 insn per cycle + 1.783393950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.105516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.105516e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089421e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111598e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.533989 sec - 4,371,523,225 cycles # 2.843 GHz - 10,299,869,041 instructions # 2.36 insn per cycle - 1.538284203 seconds time elapsed +TOTAL : 1.526514 sec + 4,365,565,996 cycles # 2.854 GHz + 10,298,805,774 instructions # 2.36 insn per cycle + 1.530732946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.583252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.691167e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.691167e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.324075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.430754e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.430754e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.184881 sec - 4,101,268,943 cycles # 1.874 GHz - 5,846,549,953 instructions # 1.43 insn per cycle - 2.189162148 seconds time elapsed +TOTAL : 2.262206 sec + 4,104,673,936 cycles # 1.812 GHz + 5,846,278,322 instructions # 1.42 insn per cycle + 2.266456846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index cc88ce6db1..a8aafca020 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:12 +DATE: 2023-11-08_21:52:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.617150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.773641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.773641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.668584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.838174e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.838174e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.490872 sec - 2,070,161,118 cycles # 2.946 GHz - 3,152,579,676 instructions # 1.52 insn per cycle - 0.759960652 seconds time elapsed +TOTAL : 0.491390 sec + 2,056,116,630 cycles # 2.930 GHz + 3,087,605,373 instructions # 1.50 insn per cycle + 0.760599439 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.487518e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.487518e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.636054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.636054e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.832612 sec - 3,193,307,533 cycles # 2.947 GHz - 4,978,788,975 instructions # 1.56 insn per cycle - 1.143205796 seconds time elapsed +TOTAL : 0.817784 sec + 3,130,594,447 cycles # 2.944 GHz + 4,997,770,241 instructions # 1.60 insn per cycle + 1.126915791 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.529162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.546135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546135e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.507705 sec - 19,736,202,639 cycles # 3.031 GHz - 59,616,040,959 instructions # 3.02 insn per cycle - 6.512416242 seconds time elapsed +TOTAL : 6.496533 sec + 19,730,935,453 cycles # 3.036 GHz + 59,615,663,798 instructions # 3.02 insn per cycle + 6.500895427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.815393e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.861165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.861165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.824473e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.869855e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.869855e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.431600 sec - 10,398,990,181 cycles # 3.027 GHz - 30,726,516,620 instructions # 2.95 insn per cycle - 3.436080496 seconds time elapsed +TOTAL : 3.425054 sec + 10,403,336,159 cycles # 3.035 GHz + 30,728,089,368 instructions # 2.95 insn per cycle + 3.429466512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
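What these regenerated logs mostly change is the measured throughput, so the quantity worth eyeballing in each diff is the relative shift between the '-'/'+' EvtsPerSec pairs. A hypothetical helper (not part of the repository) that extracts it, shown on the Rmb+ME pair from the bridge block above:

import re

def evts_per_sec(line: str) -> float:
    # Lines look like: "EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1"
    m = re.search(r"=\s*\(\s*([0-9.eE+-]+)\s*\)\s*sec\^-1", line)
    if m is None:
        raise ValueError(f"not an EvtsPerSec line: {line!r}")
    return float(m.group(1))

old = evts_per_sec("-EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1")
new = evts_per_sec("+EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1")
print(f"{(new / old - 1) * 100:+.1f}%")  # -> +2.5%, i.e. run-to-run noise
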
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.253880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.426870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.426870e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.541398e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.724381e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.724381e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.802152 sec - 4,928,997,803 cycles # 2.730 GHz - 11,072,368,065 instructions # 2.25 insn per cycle - 1.806633331 seconds time elapsed +TOTAL : 1.747981 sec + 4,923,635,172 cycles # 2.811 GHz + 11,072,838,099 instructions # 2.25 insn per cycle + 1.752609449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076136e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.072827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095239e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.553423 sec - 4,411,400,335 cycles # 2.833 GHz - 10,349,798,385 instructions # 2.35 insn per cycle - 1.557941492 seconds time elapsed +TOTAL : 1.557290 sec + 4,408,906,008 cycles # 2.824 GHz + 10,349,337,234 instructions # 2.35 insn per cycle + 1.561766662 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.266833e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.375233e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.375233e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.462789e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.573036e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.573036e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.287929 sec - 4,148,582,308 cycles # 1.811 GHz - 5,885,924,420 instructions # 1.42 insn per cycle - 2.292472050 seconds time elapsed +TOTAL : 2.226828 sec + 4,140,433,235 cycles # 1.856 GHz + 5,883,947,133 instructions # 1.42 insn per cycle + 2.231231918 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) 
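The "=Symbols in CPPProcess.o=" lines tally the generated code's instructions by SIMD width, which is how one can see at a glance that e.g. a 512z build really emits zmm code. As a rough illustration only (this is not the repository's actual tooling, and real ISA classification is subtler than register width), such a tally could be approximated by disassembling the object file:

import subprocess
from collections import Counter

def simd_tally(objfile: str) -> Counter:
    # Disassemble and bucket instructions by the widest SIMD register used.
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    tally = Counter()
    for line in asm.splitlines():
        if "%zmm" in line:    # 512-bit registers -> "512z"-like bucket
            tally["512z"] += 1
        elif "%ymm" in line:  # 256-bit registers -> "avx2"/"512y"-like bucket
            tally["avx2"] += 1
        elif "%xmm" in line:  # 128-bit registers -> "sse4"-like bucket
            tally["sse4"] += 1
    return tally

print(simd_tally("CPPProcess.o"))  # path is illustrative
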
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 890a9e444f..2485d7fbb8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:06:37 +DATE: 2023-11-08_21:21:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.934806e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040123e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052620e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.914793e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057322e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460430 sec - 1,973,324,046 cycles # 2.928 GHz - 2,840,856,751 instructions # 1.44 insn per cycle - 0.731489352 seconds time elapsed +TOTAL : 0.462395 sec + 2,001,608,406 cycles # 2.941 GHz + 2,866,642,977 instructions # 1.43 insn per cycle + 0.738112039 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.120884e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312101e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.109030e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310930e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322842e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.593653 sec - 2,438,307,110 cycles # 2.956 GHz - 3,770,815,852 instructions # 1.55 insn per cycle - 0.884294118 seconds time elapsed +TOTAL : 0.592309 sec + 2,454,004,684 cycles # 2.967 GHz + 3,701,468,710 instructions # 1.51 insn per cycle + 0.885901852 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.568377e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.546247e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558939e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.401933 sec - 19,482,758,220 cycles # 3.042 GHz - 58,802,978,389 instructions # 3.02 insn per cycle - 6.406140471 seconds time elapsed +TOTAL : 6.457597 sec + 19,573,619,879 cycles # 3.030 GHz + 58,802,481,580 instructions # 3.00 insn per cycle + 6.461777687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.917983e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.963815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.963815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.793642e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.840400e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.840400e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.353380 sec - 10,239,214,469 cycles # 3.050 GHz - 30,351,045,797 instructions # 2.96 insn per cycle - 3.357673213 seconds time elapsed +TOTAL : 3.440445 sec + 10,252,301,234 cycles # 2.977 GHz + 30,351,085,669 instructions # 2.96 insn per cycle + 3.444877379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.402320e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570383e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570383e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.384802e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.551869e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.551869e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.764710 sec - 5,042,998,580 cycles # 2.852 GHz - 11,486,615,235 instructions # 2.28 insn per cycle - 1.768978894 seconds time elapsed +TOTAL : 1.768254 sec + 5,044,938,195 cycles # 2.848 GHz + 11,486,596,301 instructions # 2.28 insn per cycle + 1.772428896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.003860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.019018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.038703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.038703e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.654433 sec - 4,647,317,234 cycles # 2.803 GHz - 10,844,918,785 instructions # 2.33 insn per cycle - 1.658681615 seconds time elapsed +TOTAL : 1.630183 sec + 4,647,706,592 cycles # 2.845 GHz + 10,845,108,593 instructions # 2.33 insn per cycle + 1.634411362 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.419133e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.526721e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.526721e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.188773e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.290125e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.290125e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.233568 sec - 4,119,227,015 cycles # 1.842 GHz - 6,111,995,104 instructions # 1.48 insn per cycle - 2.238507475 seconds time elapsed +TOTAL : 2.304290 sec + 4,123,403,300 cycles # 1.794 GHz + 6,113,558,333 instructions # 1.48 insn per cycle + 2.308644720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 906002ccef..0b448796b2 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:06 +DATE: 2023-11-08_21:22:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.570718e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.423909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.567286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.376211e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468457e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445719 sec - 1,977,839,409 cycles # 2.946 GHz - 2,766,831,818 instructions # 1.40 insn per cycle - 0.728762524 seconds time elapsed +TOTAL : 0.444374 sec + 1,959,403,583 cycles # 2.932 GHz + 2,755,627,615 instructions # 1.41 insn per cycle + 0.725331091 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.444258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.461256e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527187e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.353667e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.408300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.476909e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.490311 sec - 2,098,277,441 cycles # 2.940 GHz - 3,050,395,563 instructions # 1.45 insn per cycle - 0.771282830 seconds time elapsed +TOTAL : 0.490778 sec + 2,119,348,519 cycles # 2.946 GHz + 3,045,536,225 instructions # 1.44 insn per cycle + 0.776414109 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 32,139,063 cycles # 2.763 GHz - 49,369,582 instructions # 1.54 insn per cycle - 0.012019390 seconds time elapsed + 31,825,625 cycles # 2.791 GHz + 48,514,379 instructions # 1.52 insn per cycle + 0.011782396 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index afa8c22c25..2f35cf010a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:42 +DATE: 2023-11-08_21:53:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.139273e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.139273e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.915722e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.200179e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200179e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.458037 sec - 1,958,659,698 cycles # 2.936 GHz - 2,907,533,469 instructions # 1.48 insn per cycle - 0.726231579 seconds time elapsed +TOTAL : 0.459965 sec + 1,913,489,356 cycles # 2.854 GHz + 2,835,494,218 instructions # 1.48 insn per cycle + 0.728586503 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.639472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576828e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576828e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.767536e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.641642e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641642e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.638235 sec - 2,567,083,186 cycles # 2.951 GHz - 3,965,073,751 instructions # 1.54 insn per cycle - 0.927254467 seconds time elapsed +TOTAL : 0.634368 sec + 2,553,649,677 cycles # 2.951 GHz + 3,942,242,941 instructions # 1.54 insn per cycle + 0.922459199 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,9 +99,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 38,813,158 cycles # 2.791 GHz - 52,008,055 instructions # 1.34 insn per cycle - 0.014463641 seconds time elapsed + 38,286,300 cycles # 2.778 GHz + 51,959,635 instructions # 1.36 insn per cycle + 0.014194921 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index e0c37ae81b..e630fbc27d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:15 +DATE: 2023-11-08_21:22:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.552711e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.409477e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.560442e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377270e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470091e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443645 sec - 1,939,887,285 cycles # 2.958 GHz - 2,753,223,301 instructions # 1.42 insn per cycle - 0.713433638 seconds time elapsed +TOTAL : 0.443339 sec + 1,943,957,931 cycles # 2.944 GHz + 2,765,105,739 instructions # 1.42 insn per cycle + 0.717258208 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.420862e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.422248e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.487501e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.360432e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412708e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.481720e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.489840 sec - 2,095,642,051 cycles # 2.944 GHz - 3,058,032,700 instructions # 1.46 insn per cycle - 0.771189239 seconds time elapsed +TOTAL : 0.491895 sec + 2,104,648,838 cycles # 2.938 GHz + 3,025,148,863 instructions # 1.44 insn per cycle + 0.773979442 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,454,006 cycles # 2.782 GHz - 48,514,001 instructions # 1.54 insn per cycle - 0.011695448 seconds time elapsed + 31,662,761 cycles # 2.798 GHz + 47,511,797 instructions # 1.50 insn per cycle + 0.011712916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 9bd85e98d0..e83376e827 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:25 +DATE: 2023-11-08_21:22:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.981637e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050998e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064107e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.888685e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043488e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056349e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460239 sec - 1,991,164,692 cycles # 2.956 GHz - 2,861,513,835 instructions # 1.44 insn per cycle - 0.731121053 seconds time elapsed +TOTAL : 0.461575 sec + 1,992,206,499 cycles # 2.947 GHz + 2,868,298,614 instructions # 1.44 insn per cycle + 0.733257197 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125939e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329956e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111138e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327177e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595711 sec - 2,444,157,832 cycles # 2.957 GHz - 3,696,457,333 instructions # 1.51 insn per cycle - 0.888026518 seconds time elapsed +TOTAL : 0.598628 sec + 2,465,744,279 cycles # 2.958 GHz + 3,812,193,472 instructions # 1.55 insn per cycle + 0.893336251 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 35,021,922 cycles # 2.756 GHz - 50,809,631 instructions # 1.45 insn per cycle - 0.013111359 seconds time elapsed + 34,711,490 cycles # 2.787 GHz + 50,039,456 instructions # 1.44 insn per cycle + 0.012986618 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 659836495f..ab62773e76 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:34 +DATE: 2023-11-08_21:22:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.948465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054410e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.840662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.037949e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050999e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460434 sec - 1,981,925,545 cycles # 2.941 GHz - 2,855,578,890 instructions # 1.44 insn per cycle - 0.731466835 seconds time elapsed +TOTAL : 0.462948 sec + 1,939,550,045 cycles # 2.866 GHz + 2,822,181,727 instructions # 1.46 insn per cycle + 0.733825753 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.114794e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303596e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314294e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.102587e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.303113e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314475e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.592739 sec - 2,423,209,817 cycles # 2.940 GHz - 3,698,114,761 instructions # 1.53 insn per cycle - 0.885260737 seconds time elapsed +TOTAL : 0.591515 sec + 2,444,078,815 cycles # 2.952 GHz + 3,674,116,474 instructions # 1.50 insn per cycle + 0.887442466 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,542,827 cycles # 2.778 GHz - 50,097,141 instructions # 1.45 insn per cycle - 0.012808089 seconds time elapsed + 34,181,769 cycles # 2.772 GHz + 49,201,973 instructions # 1.44 insn per cycle + 0.012846211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a9f9e7f9b0..0e571e2957 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:07:44 +DATE: 2023-11-08_21:22:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.471280e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.535938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538049e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.521778 sec - 2,221,753,731 cycles # 2.953 GHz - 3,509,979,793 instructions # 1.58 insn per cycle - 0.811888374 seconds time elapsed +TOTAL : 0.522429 sec + 2,216,464,510 cycles # 2.948 GHz + 3,445,335,287 instructions # 1.55 insn per cycle + 0.813178007 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130694e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.124490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154204e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.024926 sec - 9,877,023,451 cycles # 3.016 GHz - 20,938,621,148 instructions # 2.12 insn per cycle - 3.332222792 seconds time elapsed +TOTAL : 3.028693 sec + 9,700,865,704 cycles # 2.960 GHz + 20,299,179,534 instructions # 2.09 insn per cycle + 3.337900982 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942881e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943811e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948157e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949119e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949119e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.450914 sec - 25,661,004,969 cycles # 3.035 GHz - 78,943,064,293 instructions # 3.08 insn per cycle - 8.455241133 seconds time elapsed +TOTAL : 8.428390 sec + 25,658,286,461 cycles # 3.043 GHz + 78,943,496,553 instructions # 3.08 insn per cycle + 8.432674701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.566286e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569647e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569647e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.638426e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.641828e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641828e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.607952 sec - 12,925,846,736 cycles # 2.803 GHz - 39,287,875,718 instructions # 3.04 insn per cycle - 4.612260028 seconds time elapsed +TOTAL : 4.516511 sec + 12,940,511,466 cycles # 2.863 GHz + 39,286,083,355 instructions # 3.04 insn per cycle + 4.520821646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.376392e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.393376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.393376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.063000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.079398e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.079398e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.967322 sec - 5,576,808,906 cycles # 2.829 GHz - 13,690,679,702 instructions # 2.45 insn per cycle - 1.971661788 seconds time elapsed +TOTAL : 2.043453 sec + 5,578,804,578 cycles # 2.725 GHz + 13,689,979,347 instructions # 2.45 insn per cycle + 2.047766279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.568825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.591271e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.591271e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.584845e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.608001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.608001e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.723570 sec - 4,897,962,779 cycles # 2.836 GHz - 12,345,795,320 instructions # 2.52 insn per cycle - 1.727906957 seconds time elapsed +TOTAL : 1.720447 sec + 4,895,207,627 cycles # 2.839 GHz + 12,344,429,833 instructions # 2.52 insn per cycle + 1.724685286 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.463403e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.476893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.476893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.405020e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.418567e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.418567e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.207008 sec - 4,113,706,051 cycles # 1.861 GHz - 6,338,446,257 instructions # 1.54 insn per cycle - 2.211395304 seconds time elapsed +TOTAL : 2.224337 sec + 4,116,450,066 cycles # 1.848 GHz + 6,337,280,624 instructions # 1.54 insn per cycle + 2.228619766 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 05b9b7b471..6cfffac867 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:37:26 +DATE: 2023-11-08_21:53:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.138586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.475297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.475297e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.140206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.481973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481973e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.514369 sec - 2,174,774,169 cycles # 2.935 GHz - 3,408,753,270 instructions # 1.57 insn per cycle - 0.802511668 seconds time elapsed +TOTAL : 0.512248 sec + 2,184,996,199 cycles # 2.952 GHz + 3,435,282,796 instructions # 1.57 insn per cycle + 0.800472589 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.635405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119639e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119639e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.623195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.099384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099384e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.311178 sec - 10,730,531,324 cycles # 2.994 GHz - 24,179,707,994 instructions # 2.25 insn per cycle - 3.640277810 seconds time elapsed +TOTAL : 3.306442 sec + 10,620,771,247 cycles # 2.970 GHz + 24,014,706,294 instructions # 2.26 insn per cycle + 3.633696672 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.906612e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907549e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907549e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.935055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935984e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.615680 sec - 25,666,310,685 cycles # 2.978 GHz - 78,949,148,944 instructions # 3.08 insn per cycle - 8.620265583 seconds time elapsed +TOTAL : 8.489050 sec + 25,665,712,522 cycles # 3.023 GHz + 78,953,227,075 instructions # 3.08 insn per cycle + 8.493532453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.685334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.688850e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.688850e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600578e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604115e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604115e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.463406 sec - 12,942,626,026 cycles # 2.897 GHz - 39,297,696,719 instructions # 3.04 insn per cycle - 4.468216686 seconds time elapsed +TOTAL : 4.569107 sec + 12,945,693,806 cycles # 2.831 GHz + 39,298,314,532 instructions # 3.04 insn per cycle + 4.573645709 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) 
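A quick way to compare the old (-) and new (+) throughput numbers in hunks like the ones above is to pair up the EvtsPerSec lines and print their relative change; the few-percent deltas seen here look like run-to-run noise rather than a real performance shift. A minimal Python sketch, assuming the patch text is saved locally as ggttgg.diff (the filename and script are illustrative, not part of the repository):

    #!/usr/bin/env python3
    # Pair '-EvtsPerSec...' and '+EvtsPerSec...' lines from a log diff
    # and print the relative change of each throughput counter.
    import re, sys
    pat = re.compile(r'^([-+])EvtsPerSec\[([^\]]+)\]\s+\((\w+)\)\s+=\s+\(\s*([0-9.eE+-]+)\s*\)')
    old, new = [], []
    for line in open(sys.argv[1]):
        m = pat.match(line)
        if m:
            sign, metric, tag, val = m.groups()
            (old if sign == '-' else new).append((metric, tag, float(val)))
    for (met, tag, o), (_, _, n) in zip(old, new):
        print(f'{met} ({tag}): {o:.3e} -> {n:.3e} sec^-1 ({100 * (n / o - 1):+.1f}%)')

Run for instance as "python3 compare.py ggttgg.diff" to list every counter's old value, new value, and percentage change.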
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.403877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.422097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.422097e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.385455e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.402719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.402719e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.965161 sec - 5,597,716,321 cycles # 2.843 GHz - 13,700,115,311 instructions # 2.45 insn per cycle - 1.969720229 seconds time elapsed +TOTAL : 1.969364 sec + 5,591,964,229 cycles # 2.834 GHz + 13,700,332,532 instructions # 2.45 insn per cycle + 1.973976640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573549e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596918e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596918e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.515181e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.538996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.538996e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726627 sec - 4,910,197,742 cycles # 2.838 GHz - 12,354,930,161 instructions # 2.52 insn per cycle - 1.731069519 seconds time elapsed +TOTAL : 1.736968 sec + 4,912,884,670 cycles # 2.825 GHz + 12,356,069,233 instructions # 2.52 insn per cycle + 1.741510676 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.408369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.421923e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.421923e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.401693e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.415615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.415615e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.227463 sec - 4,132,274,023 cycles # 1.852 GHz - 6,348,232,709 instructions # 1.54 insn per cycle - 2.231941444 seconds time elapsed +TOTAL : 2.229815 sec + 4,139,073,894 cycles # 1.853 GHz + 6,348,807,900 instructions # 1.53 insn per cycle + 2.234437952 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4a13c45dc..829db14182 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:49:13 +DATE: 2023-11-08_22:05:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.522013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498087e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526521e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.506263 sec - 2,193,209,541 cycles # 2.934 GHz - 3,448,112,270 instructions # 1.57 insn per cycle - 0.811794626 seconds time elapsed +TOTAL : 0.505963 sec + 2,230,602,584 cycles # 2.998 GHz + 3,509,146,743 instructions # 1.57 insn per cycle + 0.814638005 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176419e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.138629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.170285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.171692e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.133332 sec - 10,144,803,574 cycles # 2.992 GHz - 22,979,164,856 instructions # 2.27 insn per cycle - 3.446699997 seconds time elapsed +TOTAL : 3.117149 sec + 10,263,257,910 cycles # 3.044 GHz + 22,984,843,224 instructions # 2.24 insn per cycle + 3.428387488 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.934897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935823e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935823e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955282e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.956235e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.956235e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.487397 sec - 25,642,144,633 cycles # 3.020 GHz - 78,942,503,354 instructions # 3.08 insn per cycle - 8.491509185 seconds time elapsed +TOTAL : 8.398336 sec + 25,654,945,707 cycles # 3.057 GHz + 78,946,836,924 instructions # 3.08 insn per cycle + 8.402318295 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.604711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608085e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608085e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739022e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742322e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742322e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.560510 sec - 12,949,935,406 cycles # 2.841 GHz - 39,287,959,625 instructions # 3.03 insn per cycle - 4.564590789 seconds time elapsed +TOTAL : 4.397110 sec + 12,932,706,473 cycles # 2.939 GHz + 39,284,078,298 instructions # 3.04 insn per cycle + 4.401176578 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331820e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.349574e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.349574e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.547122e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.565515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.565515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.979581 sec - 5,585,242,942 cycles # 2.817 GHz - 13,688,645,923 instructions # 2.45 insn per cycle - 1.983846301 seconds time elapsed +TOTAL : 1.929747 sec + 5,584,587,761 cycles # 2.889 GHz + 13,688,784,163 instructions # 2.45 insn per cycle + 1.933938249 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.501909e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.523734e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.523734e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.712996e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.736353e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.736353e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.737131 sec - 4,904,473,574 cycles # 2.818 GHz - 12,343,066,066 instructions # 2.52 insn per cycle - 1.741373569 seconds time elapsed +TOTAL : 1.699524 sec + 4,899,825,358 cycles # 2.877 GHz + 12,342,496,756 instructions # 2.52 insn per cycle + 1.703963805 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326865e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.339889e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.339889e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.584277e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.599062e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.599062e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.249568 sec - 4,122,823,033 cycles # 1.830 GHz - 6,335,244,526 instructions # 1.54 insn per cycle - 2.253741280 seconds time elapsed +TOTAL : 2.173644 sec + 4,127,419,767 cycles # 1.897 GHz + 6,336,272,499 instructions # 1.54 insn per cycle + 2.177878840 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
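The perf-style counter lines can be cross-checked against one another: the reported clock is roughly cycles divided by run time (only roughly, because perf normalizes cycles to task-clock rather than the wall-clock "seconds time elapsed"), while "insn per cycle" is exactly instructions divided by cycles. A short check using the 512z block just above:

    # Sanity-check of the derived counters in the 512z block above.
    cycles = 4_127_419_767
    instructions = 6_336_272_499
    elapsed = 2.177878840                                 # wall-clock seconds
    print(f'{cycles / elapsed / 1e9:.2f} GHz')            # ~1.90 (log reports 1.897)
    print(f'{instructions / cycles:.2f} insn per cycle')  # 1.54, as in the log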
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 8a019b9732..35703491ac 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:45:52 +DATE: 2023-11-08_22:02:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.497991e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.525524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527678e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.483209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.509549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.511610e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505150 sec - 2,198,803,568 cycles # 2.954 GHz - 3,469,496,289 instructions # 1.58 insn per cycle - 0.812740673 seconds time elapsed +TOTAL : 0.505331 sec + 2,237,004,452 cycles # 3.017 GHz + 3,469,560,739 instructions # 1.55 insn per cycle + 0.813831791 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.149366e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.183697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185208e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.169549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.170864e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.069067 sec - 9,961,450,693 cycles # 3.001 GHz - 22,775,488,914 instructions # 2.29 insn per cycle - 3.378594275 seconds time elapsed +TOTAL : 3.063844 sec + 10,025,654,279 cycles # 3.024 GHz + 22,437,691,349 instructions # 2.24 insn per cycle + 3.371428026 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.920062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.972408e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.973332e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973332e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.555088 sec - 25,630,164,257 cycles # 2.995 GHz - 78,942,698,347 instructions # 3.08 insn per cycle - 8.559388166 seconds time elapsed +TOTAL : 8.324018 sec + 25,644,049,472 cycles # 3.080 GHz + 78,945,889,994 instructions # 3.08 insn per cycle + 8.328093218 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.673575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677034e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.757960e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761409e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761409e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.474572 sec - 12,938,774,287 cycles # 2.890 GHz - 39,284,863,862 instructions # 3.04 insn per cycle - 4.478882140 seconds time elapsed +TOTAL : 4.373690 sec + 12,932,578,462 cycles # 2.955 GHz + 39,286,223,538 instructions # 3.04 insn per cycle + 4.377750469 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.365364e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382422e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382422e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504027e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.521553e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.521553e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.970072 sec - 5,585,160,191 cycles # 2.830 GHz - 13,689,327,859 instructions # 2.45 insn per cycle - 1.974279626 seconds time elapsed +TOTAL : 1.937880 sec + 5,579,002,067 cycles # 2.875 GHz + 13,689,941,055 instructions # 2.45 insn per cycle + 1.941926119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573694e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596726e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596726e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.762551e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.785341e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785341e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.722482 sec - 4,895,075,879 cycles # 2.836 GHz - 12,344,411,096 instructions # 2.52 insn per cycle - 1.726704102 seconds time elapsed +TOTAL : 1.689011 sec + 4,900,729,891 cycles # 2.896 GHz + 12,344,260,353 instructions # 2.52 insn per cycle + 1.693208802 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.342892e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.356180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.356180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.678242e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.692622e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.692622e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.243467 sec - 4,145,301,834 cycles # 1.845 GHz - 6,337,134,423 instructions # 1.53 insn per cycle - 2.247770943 seconds time elapsed +TOTAL : 2.144938 sec + 4,120,050,897 cycles # 1.918 GHz + 6,337,719,473 instructions # 1.54 insn per cycle + 2.149063218 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
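The VECTOR[N] width in "Internal loops fptype_sv" follows directly from the SIMD register size: for these DOUBLE-precision runs, N is the register width in bits divided by 64, which is why sse4 (128-bit) runs with VECTOR[2], avx2 and 512y (256-bit) with VECTOR[4], and 512z (512-bit) with VECTOR[8]:

    # Doubles per SIMD register for each build tag in these logs.
    for tag, bits in [('sse4', 128), ('avx2', 256), ('512y', 256), ('512z', 512)]:
        print(f'{tag}: VECTOR[{bits // 64}]')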
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0761c0d014..e3bb9b2d2b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:42:34 +DATE: 2023-11-08_21:59:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.224877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.202444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.498519e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.509685 sec - 2,194,677,974 cycles # 2.952 GHz - 3,468,699,947 instructions # 1.58 insn per cycle - 0.805522488 seconds time elapsed +TOTAL : 0.507729 sec + 2,224,053,547 cycles # 2.995 GHz + 3,511,447,697 instructions # 1.58 insn per cycle + 0.804264263 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178277e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.754243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179050e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.196085 sec - 10,332,938,289 cycles # 2.993 GHz - 23,233,171,839 instructions # 2.25 insn per cycle - 3.511259911 seconds time elapsed +TOTAL : 3.195418 sec + 10,560,218,824 cycles # 3.053 GHz + 23,272,224,469 instructions # 2.20 insn per cycle + 3.516017944 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927835e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928807e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928807e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.980652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.981660e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981660e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.516575 sec - 25,626,746,874 cycles # 3.008 GHz - 78,942,783,638 instructions # 3.08 insn per cycle - 8.520860421 seconds time elapsed +TOTAL : 8.289291 sec + 25,689,530,854 cycles # 3.098 GHz + 78,941,485,494 instructions # 3.07 insn per cycle + 8.293329329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.674456e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677849e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677849e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695812e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.699396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699396e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.472834 sec - 12,938,647,402 cycles # 2.891 GHz - 39,285,558,550 instructions # 3.04 insn per cycle - 4.477166946 seconds time elapsed +TOTAL : 4.446640 sec + 12,939,707,143 cycles # 2.908 GHz + 39,286,790,527 instructions # 3.04 insn per cycle + 4.450934428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.290335e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307469e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307469e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.540564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.557570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.557570e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.987857 sec - 5,582,015,296 cycles # 2.804 GHz - 13,690,066,849 instructions # 2.45 insn per cycle - 1.992149312 seconds time elapsed +TOTAL : 1.929715 sec + 5,584,326,574 cycles # 2.891 GHz + 13,690,307,414 instructions # 2.45 insn per cycle + 1.933841922 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.537627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.561759e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.561759e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.772043e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.794673e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.794673e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.729438 sec - 4,899,116,746 cycles # 2.827 GHz - 12,344,356,410 instructions # 2.52 insn per cycle - 1.733854664 seconds time elapsed +TOTAL : 1.687536 sec + 4,894,600,072 cycles # 2.895 GHz + 12,345,111,795 instructions # 2.52 insn per cycle + 1.691722733 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.331605e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.345774e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.345774e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.667022e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.680748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.680748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.247519 sec - 4,126,377,191 cycles # 1.833 GHz - 6,337,288,668 instructions # 1.54 insn per cycle - 2.251874954 seconds time elapsed +TOTAL : 2.148381 sec + 4,119,534,680 cycles # 1.915 GHz + 6,337,066,991 instructions # 1.54 insn per cycle + 2.152520896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index d519ec18af..2d6466a5d0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:08:21 +DATE: 2023-11-08_21:23:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482135e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.511176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.472415e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497562e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499582e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519206 sec - 2,212,325,201 cycles # 2.954 GHz - 3,433,704,735 instructions # 1.55 insn per cycle - 0.807580904 seconds time elapsed +TOTAL : 0.522603 sec + 2,199,879,357 cycles # 2.926 GHz + 3,406,329,945 instructions # 1.55 insn per cycle + 0.812598895 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.159162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.186085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.187240e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.151978e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.180898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182114e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.004869 sec - 9,812,463,662 cycles # 3.013 GHz - 21,581,231,713 instructions # 2.20 insn per cycle - 3.312573877 seconds time elapsed +TOTAL : 3.012462 sec + 9,824,681,781 cycles # 3.013 GHz + 20,251,773,236 instructions # 2.06 insn per cycle + 3.320916673 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947345e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.948277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.948277e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948786e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949722e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949722e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.431137 sec - 25,590,035,480 cycles # 3.034 GHz - 78,715,048,416 instructions # 3.08 insn per cycle - 8.435307792 seconds time elapsed +TOTAL : 8.425425 sec + 25,600,858,897 cycles # 3.038 GHz + 78,714,675,174 instructions # 3.07 insn per cycle + 8.429623210 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.620452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623805e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623805e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.648721e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652034e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652034e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.539871 sec - 12,909,848,042 cycles # 2.843 GHz - 39,233,023,972 instructions # 3.04 insn per cycle - 4.544176080 seconds time elapsed +TOTAL : 4.503525 sec + 12,897,071,716 cycles # 2.862 GHz + 39,231,170,693 instructions # 3.04 insn per cycle + 4.507786711 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.348654e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.348654e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.358235e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.375211e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.375211e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.977747 sec - 5,618,064,764 cycles # 2.836 GHz - 13,804,762,963 instructions # 2.46 insn per cycle - 1.981982814 seconds time elapsed +TOTAL : 1.971459 sec + 5,607,121,481 cycles # 2.839 GHz + 13,803,544,350 instructions # 2.46 insn per cycle + 1.975775051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.463129e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.484771e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.484771e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.338508e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.360185e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.360185e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.742192 sec - 4,960,747,667 cycles # 2.842 GHz - 12,470,817,922 instructions # 2.51 insn per cycle - 1.746604551 seconds time elapsed +TOTAL : 1.768893 sec + 4,962,697,559 cycles # 2.805 GHz + 12,469,802,045 instructions # 2.51 insn per cycle + 1.786199910 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.427183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.440655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.440655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.426426e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.440315e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.440315e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.217977 sec - 4,119,292,054 cycles # 1.855 GHz - 6,462,314,928 instructions # 1.57 insn per cycle - 2.222289185 seconds time elapsed +TOTAL : 2.218010 sec + 4,123,694,980 cycles # 1.856 GHz + 6,461,412,200 instructions # 1.57 insn per cycle + 2.222394946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) 
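Dividing each vectorized throughput by the no-SIMD baseline of the same log makes the SIMD gain explicit; for example, with the new (+) MECalcOnly numbers of this inl0_hrd1 log:

    # SIMD speedups over the scalar 'none' build (new MECalcOnly values above).
    base = 1.949722e3                       # 'none'
    for tag, tput in [('sse4', 3.652034e3), ('avx2', 8.375211e3),
                      ('512y', 9.360185e3), ('512z', 7.440315e3)]:
        print(f'{tag}: x{tput / base:.1f}')  # ~1.9, 4.3, 4.8, 3.8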
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 0e734b6c9d..a4e352ee76 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:26:58 +DATE: 2023-11-08_21:43:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.264647e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.232524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.256814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.259170e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533653 sec - 2,219,666,724 cycles # 2.910 GHz - 3,445,153,040 instructions # 1.55 insn per cycle - 0.821091738 seconds time elapsed +TOTAL : 0.534744 sec + 2,248,485,126 cycles # 2.941 GHz + 3,494,101,027 instructions # 1.55 insn per cycle + 0.823969121 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.803191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.804422e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777627e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.804807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.805966e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.300230 sec - 10,634,484,052 cycles # 2.991 GHz - 23,844,861,281 instructions # 2.24 insn per cycle - 3.611693691 seconds time elapsed +TOTAL : 3.297104 sec + 10,673,501,263 cycles # 3.005 GHz + 24,226,094,920 instructions # 2.27 insn per cycle + 3.607615064 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.361422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.361903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.361903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.346513e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.346993e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.346993e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.613350 sec - 113,653,626,732 cycles # 3.022 GHz - 144,966,182,806 instructions # 1.28 insn per cycle - 37.617592948 seconds time elapsed +TOTAL : 37.741985 sec + 113,582,106,901 cycles # 3.009 GHz + 144,968,769,114 instructions # 1.28 insn per cycle + 37.746219696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.197160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.199710e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199710e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.143430e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.145919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.145919e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.138561 sec - 14,751,525,638 cycles # 2.870 GHz - 37,578,516,323 instructions # 2.55 insn per cycle - 5.143061031 seconds time elapsed +TOTAL : 5.226537 sec + 14,726,949,716 cycles # 2.816 GHz + 37,578,521,140 instructions # 2.55 insn per cycle + 5.230978594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.662015e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.676566e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.676566e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.619134e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.633428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.633428e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.150367 sec - 6,125,090,080 cycles # 2.844 GHz - 13,063,740,704 instructions # 2.13 insn per cycle - 2.154679772 seconds time elapsed +TOTAL : 2.162328 sec + 6,132,958,052 cycles # 2.832 GHz + 13,063,746,182 instructions # 2.13 insn per cycle + 2.166766443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.263953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.285040e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.285040e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.242664e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.263271e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.263271e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.780016 sec - 5,060,160,878 cycles # 2.837 GHz - 11,442,229,361 instructions # 2.26 insn per cycle - 1.784487029 seconds time elapsed +TOTAL : 1.783918 sec + 5,064,574,027 cycles # 2.835 GHz + 11,442,541,397 instructions # 2.26 insn per cycle + 1.788276031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.515689e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.530167e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.530167e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693472e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708550e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708550e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.192230 sec - 3,982,582,654 cycles # 1.814 GHz - 5,943,874,364 instructions # 1.49 insn per cycle - 2.196624515 seconds time elapsed +TOTAL : 2.141610 sec + 3,984,341,945 cycles # 1.859 GHz + 5,944,587,769 instructions # 1.49 insn per cycle + 2.145941939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index a431669edb..c9a3c0bc00 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:28:07 +DATE: 2023-11-08_21:44:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.227099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.252215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.254306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.238547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265593e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530677 sec - 2,254,800,400 cycles # 2.956 GHz - 3,541,881,168 instructions # 1.57 insn per cycle - 0.819833622 seconds time elapsed +TOTAL : 0.528864 sec + 2,246,214,726 cycles # 2.961 GHz + 3,512,868,349 instructions # 1.56 insn per cycle + 0.816400547 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821318e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.792504e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.819675e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.820783e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.276536 sec - 10,598,798,874 cycles # 3.001 GHz - 22,505,546,793 instructions # 2.12 insn per cycle - 3.590880872 seconds time elapsed +TOTAL : 3.270254 sec + 10,633,900,320 cycles # 3.014 GHz + 24,514,837,826 instructions # 2.31 insn per cycle + 3.584387558 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.316847e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.317310e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.317310e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.327617e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.328084e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.328084e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.002712 sec - 114,613,209,494 cycles # 3.016 GHz - 145,560,103,749 instructions # 1.27 insn per cycle - 38.007069023 seconds time elapsed +TOTAL : 37.906136 sec + 114,405,747,001 cycles # 3.018 GHz + 145,562,165,740 instructions # 1.27 insn per cycle + 37.910396057 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.101440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.103871e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.103871e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.123383e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.123383e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.297737 sec - 15,180,958,119 cycles # 2.864 GHz - 37,765,704,407 instructions # 2.49 insn per cycle - 5.302092232 seconds time elapsed +TOTAL : 5.264434 sec + 15,164,870,179 cycles # 2.879 GHz + 37,765,103,372 instructions # 2.49 insn per cycle + 5.268658441 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) 
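The "=Symbols in CPPProcess.o=" tallies count SIMD instructions of each class in the compiled object. A hedged sketch of how such counts could be approximated by hand, by register width in the disassembly (illustrative only; the tallies in these logs come from the repository's own scripts, which may classify instructions differently):

    # Rough per-width SIMD instruction tally for CPPProcess.o via objdump.
    import subprocess
    from collections import Counter
    asm = subprocess.run(['objdump', '-d', 'CPPProcess.o'],
                         capture_output=True, text=True).stdout
    widths = Counter()
    for line in asm.splitlines():
        for reg, label in (('%zmm', '512-bit'), ('%ymm', '256-bit'), ('%xmm', '128-bit')):
            if reg in line:
                widths[label] += 1
                break  # count each instruction once, widest register first
    print(dict(widths))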
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.750289e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.764988e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.764988e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.815263e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.829969e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.829969e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.125646 sec
- 6,006,519,083 cycles # 2.821 GHz
- 12,897,926,690 instructions # 2.15 insn per cycle
- 2.130039886 seconds time elapsed
+TOTAL : 2.107998 sec
+ 6,006,546,140 cycles # 2.845 GHz
+ 12,898,448,008 instructions # 2.15 insn per cycle
+ 2.112261899 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.134516e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.155464e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.155464e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.170106e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.191645e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.191645e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.805195 sec
- 5,111,264,978 cycles # 2.826 GHz
- 11,448,660,091 instructions # 2.24 insn per cycle
- 1.809562076 seconds time elapsed
+TOTAL : 1.798019 sec
+ 5,110,595,937 cycles # 2.837 GHz
+ 11,448,746,145 instructions # 2.24 insn per cycle
+ 1.802331588 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.713307e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.727980e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.727980e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.719086e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.733849e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.733849e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.136153 sec
- 3,956,606,945 cycles # 1.850 GHz
- 5,898,384,643 instructions # 1.49 insn per cycle
- 2.140545061 seconds time elapsed
+TOTAL : 2.134583 sec
+ 3,969,461,110 cycles # 1.857 GHz
+ 5,897,831,571 instructions # 1.49 insn per cycle
+ 2.138816528 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index 389fe370ef..9c1de01f16 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:08:57
+DATE: 2023-11-08_21:23:53
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.330449e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.375316e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.385679e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.293342e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.339166e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.344348e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.478801 sec
- 2,034,971,060 cycles # 2.940 GHz
- 3,054,212,240 instructions # 1.50 insn per cycle
- 0.749375620 seconds time elapsed
+TOTAL : 0.481289 sec
+ 2,043,429,418 cycles # 2.945 GHz
+ 3,016,391,404 instructions # 1.48 insn per cycle
+ 0.753087040 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.529589e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.587136e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.589764e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.613713e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.676727e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.679629e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.723184 sec
- 5,782,983,871 cycles # 2.964 GHz
- 12,066,403,823 instructions # 2.09 insn per cycle
- 2.008243733 seconds time elapsed
+TOTAL : 1.713007 sec
+ 5,846,211,987 cycles # 2.997 GHz
+ 12,059,135,892 instructions # 2.06 insn per cycle
+ 2.007812305 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.003677e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.004662e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.004662e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.005115e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.006106e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.006106e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.193664 sec
- 24,655,416,435 cycles # 3.008 GHz
- 78,134,412,275 instructions # 3.17 insn per cycle
- 8.197717930 seconds time elapsed
+TOTAL : 8.187511 sec
+ 24,627,671,323 cycles # 3.007 GHz
+ 78,134,663,224 instructions # 3.17 insn per cycle
+ 8.191568767 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.270897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.285143e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.285143e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.313136e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.326827e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.326827e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.263632 sec
- 6,475,526,341 cycles # 2.856 GHz
- 20,124,982,632 instructions # 3.11 insn per cycle
- 2.267936828 seconds time elapsed
+TOTAL : 2.250414 sec
+ 6,477,846,372 cycles # 2.874 GHz
+ 20,124,481,745 instructions # 3.11 insn per cycle
+ 2.254575609 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.655891e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.662862e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.662862e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.651750e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.658578e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.658578e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.998679 sec
- 2,840,454,971 cycles # 2.834 GHz
- 6,992,590,525 instructions # 2.46 insn per cycle
- 1.002898964 seconds time elapsed
+TOTAL : 1.000733 sec
+ 2,836,203,846 cycles # 2.824 GHz
+ 6,991,580,060 instructions # 2.47 insn per cycle
+ 1.005051926 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.904708e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.914180e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.914180e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.891596e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.900607e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900607e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.868982 sec
- 2,491,374,231 cycles # 2.855 GHz
- 6,299,681,276 instructions # 2.53 insn per cycle
- 0.873227215 seconds time elapsed
+TOTAL : 0.874979 sec
+ 2,489,876,695 cycles # 2.834 GHz
+ 6,298,919,091 instructions # 2.53 insn per cycle
+ 0.879145628 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.509691e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.515612e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.515612e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.492404e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.498044e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.498044e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.094413 sec
- 2,048,957,877 cycles # 1.866 GHz
- 3,269,073,408 instructions # 1.60 insn per cycle
- 1.098654820 seconds time elapsed
+TOTAL : 1.107211 sec
+ 2,056,905,721 cycles # 1.852 GHz
+ 3,268,863,177 instructions # 1.59 insn per cycle
+ 1.111361855 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
index 5a5ccf0962..7ef08eb1a1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:38:03
+DATE: 2023-11-08_21:54:33
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.621379e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.322960e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.322960e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.630785e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.310772e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.310772e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.467719 sec
- 2,022,012,389 cycles # 2.930 GHz
- 3,029,595,627 instructions # 1.50 insn per cycle
- 0.748028952 seconds time elapsed
+TOTAL : 0.466074 sec
+ 1,998,575,796 cycles # 2.933 GHz
+ 2,994,965,957 instructions # 1.50 insn per cycle
+ 0.738328183 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.232227e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.472561e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.472561e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.261662e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.481805e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.481805e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.900347 sec
- 6,375,786,665 cycles # 2.982 GHz
- 13,373,135,596 instructions # 2.10 insn per cycle
- 2.195039568 seconds time elapsed
+TOTAL : 1.889932 sec
+ 6,363,844,307 cycles # 2.984 GHz
+ 13,005,964,068 instructions # 2.04 insn per cycle
+ 2.191280597 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.008350e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.009347e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.009347e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.002381e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.003373e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.003373e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.176665 sec
- 24,649,325,474 cycles # 3.013 GHz
- 78,138,045,806 instructions # 3.17 insn per cycle
- 8.180908705 seconds time elapsed
+TOTAL : 8.200661 sec
+ 24,662,776,052 cycles # 3.006 GHz
+ 78,138,608,532 instructions # 3.17 insn per cycle
+ 8.204934256 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.326247e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.339746e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.339746e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.306848e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.320652e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.320652e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.249404 sec
- 6,483,421,678 cycles # 2.878 GHz
- 20,133,640,820 instructions # 3.11 insn per cycle
- 2.253658931 seconds time elapsed
+TOTAL : 2.255006 sec
+ 6,482,848,456 cycles # 2.870 GHz
+ 20,133,573,977 instructions # 3.11 insn per cycle
+ 2.259320427 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.657895e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.664866e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.664866e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.648854e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.655690e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.999874 sec
- 2,846,897,865 cycles # 2.837 GHz
- 7,001,448,108 instructions # 2.46 insn per cycle
- 1.004235579 seconds time elapsed
+TOTAL : 1.005313 sec
+ 2,849,286,060 cycles # 2.824 GHz
+ 7,001,856,779 instructions # 2.46 insn per cycle
+ 1.009712120 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.899947e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.909346e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.909346e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.888498e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.898036e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.898036e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.873710 sec
- 2,498,501,131 cycles # 2.848 GHz
- 6,308,536,459 instructions # 2.52 insn per cycle
- 0.877964105 seconds time elapsed
+TOTAL : 0.879137 sec
+ 2,499,075,063 cycles # 2.831 GHz
+ 6,309,019,763 instructions # 2.52 insn per cycle
+ 0.883537991 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.494285e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.499863e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.499863e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.493195e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.498802e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.498802e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.108704 sec
- 2,059,473,334 cycles # 1.852 GHz
- 3,279,338,884 instructions # 1.59 insn per cycle
- 1.113120539 seconds time elapsed
+TOTAL : 1.109448 sec
+ 2,060,050,205 cycles # 1.851 GHz
+ 3,279,571,633 instructions # 1.59 insn per cycle
+ 1.113744599 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index 12ad22d5a3..4d664fc4d6 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:49:50
+DATE: 2023-11-08_22:06:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.340393e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.392051e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397944e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.355118e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.403966e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.409182e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.462195 sec
- 1,986,930,742 cycles # 2.947 GHz
- 3,005,964,493 instructions # 1.51 insn per cycle
- 0.730831332 seconds time elapsed
+TOTAL : 0.462753 sec
+ 2,014,223,981 cycles # 2.997 GHz
+ 3,038,538,632 instructions # 1.51 insn per cycle
+ 0.729935758 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.547500e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.620827e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.624055e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.565713e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.634653e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.637720e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.798053 sec
- 6,062,916,752 cycles # 2.993 GHz
- 11,569,516,184 instructions # 1.91 insn per cycle
- 2.082278895 seconds time elapsed
+TOTAL : 1.796992 sec
+ 6,172,975,790 cycles # 3.046 GHz
+ 13,083,554,495 instructions # 2.12 insn per cycle
+ 2.086223865 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.005661e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.006690e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.006690e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.045430e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.046401e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.046401e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.186459 sec
- 24,671,953,454 cycles # 3.013 GHz
- 78,137,621,710 instructions # 3.17 insn per cycle
- 8.190517160 seconds time elapsed
+TOTAL : 8.027601 sec
+ 24,633,930,216 cycles # 3.068 GHz
+ 78,134,736,063 instructions # 3.17 insn per cycle
+ 8.031555788 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.107458e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.120841e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.120841e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.461058e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.474893e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.474893e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4
-TOTAL : 2.317106 sec
- 6,488,771,451 cycles # 2.796 GHz
- 20,124,539,496 instructions # 3.10 insn per cycle
- 2.321142527 seconds time elapsed
+TOTAL : 2.206755 sec
+ 6,481,821,994 cycles # 2.933 GHz
+ 20,123,351,594 instructions # 3.10 insn per cycle
+ 2.210721958 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.647793e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.654673e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.654673e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.665888e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.672800e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.672800e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 1.005506 sec
- 2,843,966,049 cycles # 2.818 GHz
- 6,991,496,346 instructions # 2.46 insn per cycle
- 1.009548479 seconds time elapsed
+TOTAL : 0.994258 sec
+ 2,841,630,041 cycles # 2.848 GHz
+ 6,990,811,149 instructions # 2.46 insn per cycle
+ 0.998209890 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.895349e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.904605e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.904605e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.891296e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.900721e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900721e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.875040 sec
- 2,495,845,822 cycles # 2.841 GHz
- 6,297,369,404 instructions # 2.52 insn per cycle
- 0.879134455 seconds time elapsed
+TOTAL : 0.876906 sec
+ 2,495,700,726 cycles # 2.835 GHz
+ 6,297,076,618 instructions # 2.52 insn per cycle
+ 0.880948978 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.504042e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.510113e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.510113e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.552086e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.558027e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.558027e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.099941 sec
- 2,050,409,457 cycles # 1.858 GHz
- 3,265,015,309 instructions # 1.59 insn per cycle
- 1.104007255 seconds time elapsed
+TOTAL : 1.065622 sec
+ 2,049,379,894 cycles # 1.917 GHz
+ 3,265,032,857 instructions # 1.59 insn per cycle
+ 1.069477010 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index 5b13ff9774..ee315233c1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:46:29
+DATE: 2023-11-08_22:02:50
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.339869e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.391844e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397472e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.328542e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.377951e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.383103e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.461224 sec
- 1,973,907,563 cycles # 2.940 GHz
- 2,969,869,707 instructions # 1.50 insn per cycle
- 0.729741448 seconds time elapsed
+TOTAL : 0.460444 sec
+ 2,025,388,470 cycles # 3.016 GHz
+ 3,026,490,924 instructions # 1.49 insn per cycle
+ 0.728886791 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.563612e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.637504e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.640751e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.561347e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.630349e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.633332e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.749654 sec
- 5,928,680,592 cycles # 2.999 GHz
- 12,893,930,524 instructions # 2.17 insn per cycle
- 2.033490620 seconds time elapsed
+TOTAL : 1.742876 sec
+ 6,025,656,195 cycles # 3.063 GHz
+ 13,153,972,386 instructions # 2.18 insn per cycle
+ 2.023922647 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.014770e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.015759e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.015759e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.049881e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.050905e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.050905e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.147957 sec
- 24,635,567,678 cycles # 3.022 GHz
- 78,133,891,626 instructions # 3.17 insn per cycle
- 8.152140443 seconds time elapsed
+TOTAL : 8.008804 sec
+ 24,622,379,845 cycles # 3.073 GHz
+ 78,134,077,156 instructions # 3.17 insn per cycle
+ 8.012721206 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.062428e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.074909e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.074909e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.445321e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.458917e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.458917e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.330015 sec
- 6,475,827,642 cycles # 2.775 GHz
- 20,124,634,132 instructions # 3.11 insn per cycle
- 2.334037311 seconds time elapsed
+TOTAL : 2.210782 sec
+ 6,475,852,782 cycles # 2.925 GHz
+ 20,124,175,553 instructions # 3.11 insn per cycle
+ 2.214842110 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.595519e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.602006e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.602006e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.697514e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.704851e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.704851e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.036160 sec
- 2,838,919,957 cycles # 2.730 GHz
- 6,991,694,320 instructions # 2.46 insn per cycle
- 1.040335460 seconds time elapsed
+TOTAL : 0.973836 sec
+ 2,835,149,001 cycles # 2.901 GHz
+ 6,991,410,852 instructions # 2.47 insn per cycle
+ 0.977864307 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.893954e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.903085e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.903085e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.934817e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.944385e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.944385e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.873924 sec
- 2,489,283,092 cycles # 2.837 GHz
- 6,298,948,511 instructions # 2.53 insn per cycle
- 0.878050091 seconds time elapsed
+TOTAL : 0.855140 sec
+ 2,487,419,693 cycles # 2.897 GHz
+ 6,298,706,089 instructions # 2.53 insn per cycle
+ 0.859052723 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.497242e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.502884e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.502884e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.555511e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.561377e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.561377e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.103482 sec
- 2,049,248,209 cycles # 1.852 GHz
- 3,268,952,113 instructions # 1.60 insn per cycle
- 1.107551558 seconds time elapsed
+TOTAL : 1.062258 sec
+ 2,048,558,209 cycles # 1.923 GHz
+ 3,268,764,234 instructions # 1.60 insn per cycle
+ 1.066272803 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index cdb252ac3a..efdbcfe1ae 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:43:11
+DATE: 2023-11-08_21:59:38
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.764175e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.406414e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.411755e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.758974e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.368878e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.373916e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.466281 sec
- 1,989,064,547 cycles # 2.930 GHz
- 3,017,212,928 instructions # 1.52 insn per cycle
- 0.737783039 seconds time elapsed
+TOTAL : 0.462598 sec
+ 2,002,083,704 cycles # 2.975 GHz
+ 3,028,559,110 instructions # 1.51 insn per cycle
+ 0.730010364 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -71,14 +71,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.472408e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.626435e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.629621e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.506677e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.634226e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.637242e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.825252 sec
- 6,129,357,136 cycles # 2.985 GHz
- 13,024,512,874 instructions # 2.12 insn per cycle
- 2.110041533 seconds time elapsed
+TOTAL : 1.818599 sec
+ 6,254,092,117 cycles # 3.058 GHz
+ 12,631,559,563 instructions # 2.02 insn per cycle
+ 2.110653596 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.017146e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.018188e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.018188e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.065897e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.066912e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.066912e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.138680 sec
- 24,636,857,889 cycles # 3.027 GHz
- 78,136,646,989 instructions # 3.17 insn per cycle
- 8.142807331 seconds time elapsed
+TOTAL : 7.946634 sec
+ 24,618,185,681 cycles # 3.097 GHz
+ 78,133,594,453 instructions # 3.17 insn per cycle
+ 7.950536612 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.266088e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.280126e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.280126e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.469422e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.483642e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.483642e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.265043 sec
- 6,477,387,096 cycles # 2.855 GHz
- 20,124,193,083 instructions # 3.11 insn per cycle
- 2.269259910 seconds time elapsed
+TOTAL : 2.203521 sec
+ 6,477,304,059 cycles # 2.935 GHz
+ 20,124,231,259 instructions # 3.11 insn per cycle
+ 2.207560981 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.644884e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.651718e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.651718e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.692268e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.699231e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.699231e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.005131 sec
- 2,839,448,871 cycles # 2.816 GHz
- 6,991,884,623 instructions # 2.46 insn per cycle
- 1.009345557 seconds time elapsed
+TOTAL : 0.976738 sec
+ 2,836,504,426 cycles # 2.894 GHz
+ 6,991,415,909 instructions # 2.46 insn per cycle
+ 0.980720950 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.866159e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.874920e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.874920e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.804638e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.812886e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.812886e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.887083 sec
- 2,489,977,422 cycles # 2.796 GHz
- 6,298,695,060 instructions # 2.53 insn per cycle
- 0.891225776 seconds time elapsed
+TOTAL : 0.917702 sec
+ 2,493,684,467 cycles # 2.707 GHz
+ 6,299,926,195 instructions # 2.53 insn per cycle
+ 0.922017124 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.498745e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.504407e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.504407e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.542695e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.548647e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.548647e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.102380 sec
- 2,046,697,565 cycles # 1.851 GHz
- 3,268,682,926 instructions # 1.60 insn per cycle
- 1.106464577 seconds time elapsed
+TOTAL : 1.070949 sec
+ 2,049,167,689 cycles # 1.907 GHz
+ 3,268,610,487 instructions # 1.60 insn per cycle
+ 1.074921168 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index 9fe77f3bb4..afc8dc6250 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:09:27
+DATE: 2023-11-08_21:24:23
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.327293e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.373619e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.378917e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.334864e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.384627e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.390200e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.480093 sec
- 2,046,992,398 cycles # 2.957 GHz
- 3,008,261,627 instructions # 1.47 insn per cycle
- 0.750577809 seconds time elapsed
+TOTAL : 0.478485 sec
+ 2,037,012,252 cycles # 2.938 GHz
+ 3,030,553,414 instructions # 1.49 insn per cycle
+ 0.751162438 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.515177e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.572348e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.574911e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.576633e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.638822e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.641657e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.715757 sec
- 5,871,373,370 cycles # 3.006 GHz
- 12,204,738,560 instructions # 2.08 insn per cycle
- 2.009775672 seconds time elapsed
+TOTAL : 1.723660 sec
+ 5,841,021,027 cycles # 2.992 GHz
+ 11,140,232,262 instructions # 1.91 insn per cycle
+ 2.010396879 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.026797e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.027818e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.027818e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.020250e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.021294e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.021294e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.100053 sec
- 24,563,227,881 cycles # 3.031 GHz
- 77,860,200,084 instructions # 3.17 insn per cycle
- 8.104232064 seconds time elapsed
+TOTAL : 8.126388 sec
+ 24,531,986,763 cycles # 3.018 GHz
+ 77,860,700,825 instructions # 3.17 insn per cycle
+ 8.130365170 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3113) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.430084e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.444359e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.444359e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.508420e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.523945e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.523945e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.215968 sec
- 6,421,588,621 cycles # 2.894 GHz
- 20,090,220,099 instructions # 3.13 insn per cycle
- 2.220335001 seconds time elapsed
+TOTAL : 2.192196 sec
+ 6,417,749,314 cycles # 2.923 GHz
+ 20,089,444,717 instructions # 3.13 insn per cycle
+ 2.196603069 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.625861e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632520e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632520e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.619246e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.625936e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.625936e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.016598 sec
- 2,906,571,537 cycles # 2.849 GHz
- 7,134,546,428 instructions # 2.45 insn per cycle
- 1.020819368 seconds time elapsed
+TOTAL : 1.020667 sec
+ 2,904,857,639 cycles # 2.836 GHz
+ 7,133,491,112 instructions # 2.46 insn per cycle
+ 1.024733034 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.810175e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.818358e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.818358e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.807219e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.815471e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.815471e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.914087 sec
- 2,595,791,217 cycles # 2.828 GHz
- 6,442,852,611 instructions # 2.48 insn per cycle
- 0.918452804 seconds time elapsed
+TOTAL : 0.915311 sec
+ 2,597,440,177 cycles # 2.827 GHz
+ 6,442,073,160 instructions # 2.48 insn per cycle
+ 0.919440444 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.453251e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.458727e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.458727e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.330502e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.335014e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.335014e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.136514 sec
- 2,124,554,510 cycles # 1.864 GHz
- 3,431,456,558 instructions # 1.62 insn per cycle
- 1.140688320 seconds time elapsed
+TOTAL : 1.241025 sec
+ 2,122,770,451 cycles # 1.706 GHz
+ 3,430,866,539 instructions # 1.62 insn per cycle
+ 1.245371552 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2912) (512y: 22) (512z: 9647)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 6d22eac4d2..86542f0b70 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:29:17
+DATE: 2023-11-08_21:45:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.584275e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.627587e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.631963e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.570490e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.610069e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.614296e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.489018 sec
- 2,081,067,674 cycles # 2.934 GHz
- 3,133,776,802 instructions # 1.51 insn per cycle
- 0.771988427 seconds time elapsed
+TOTAL : 0.491125 sec
+ 2,098,886,797 cycles # 2.948 GHz
+ 3,121,764,784 instructions # 1.49 insn per cycle
+ 0.773983413 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.747350e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.808169e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.810857e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.716470e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.775515e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.778049e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.853996 sec
- 6,275,481,753 cycles # 3.001 GHz
- 12,514,155,894 instructions # 1.99 insn per cycle
- 2.147936222 seconds time elapsed
+TOTAL : 1.856510 sec
+ 6,241,842,396 cycles # 2.982 GHz
+ 13,362,161,836 instructions # 2.14 insn per cycle
+ 2.150637345 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.644036e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.644860e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.644860e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.736455e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.737287e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.737287e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.065327 sec
- 87,424,924,787 cycles # 3.008 GHz
- 135,567,300,472 instructions # 1.55 insn per cycle
- 29.069446346 seconds time elapsed
+TOTAL : 28.600113 sec
+ 86,425,718,035 cycles # 3.022 GHz
+ 135,574,556,258 instructions # 1.57 insn per cycle
+ 28.604413837 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15486) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.026233e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.038857e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.038857e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.030289e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.043211e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.043211e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.342565 sec
- 6,786,587,363 cycles # 2.893 GHz
- 19,387,387,931 instructions # 2.86 insn per cycle
- 2.346831164 seconds time elapsed
+TOTAL : 2.341197 sec
+ 6,779,953,097 cycles # 2.892 GHz
+ 19,387,529,866 instructions # 2.86 insn per cycle
+ 2.345543121 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.459444e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.464900e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.464900e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.479111e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.484786e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.484786e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.132478 sec
- 3,179,013,562 cycles # 2.798 GHz
- 6,809,043,401 instructions # 2.14 insn per cycle
- 1.136902959 seconds time elapsed
+TOTAL : 1.117197 sec
+ 3,179,595,887 cycles # 2.837 GHz
+ 6,808,760,792 instructions # 2.14 insn per cycle
+ 1.121370768 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.738168e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.745907e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.745907e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.783416e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.791440e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.791440e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.952016 sec
- 2,651,392,730 cycles # 2.774 GHz
- 5,987,188,755 instructions # 2.26 insn per cycle
- 0.956397839 seconds time elapsed
+TOTAL : 0.927417 sec
+ 2,649,120,857 cycles # 2.846 GHz
+ 5,987,099,017 instructions # 2.26 insn per cycle
+ 0.931540821 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.472802e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.478184e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.478184e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.490502e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.495988e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.495988e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.121995 sec
- 2,073,738,270 cycles # 1.843 GHz
- 3,501,511,021 instructions # 1.69 insn per cycle
- 1.126283052 seconds time elapsed
+TOTAL : 1.108557 sec
+ 2,075,562,698 cycles # 1.867 GHz
+ 3,501,563,321 instructions # 1.69 insn per cycle
+ 1.112823809 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 5c9ad24a46..4737cdf8e3 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:30:09
+DATE: 2023-11-08_21:46:27
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.558233e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.598421e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.603327e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.528505e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.572699e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.577185e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.487345 sec
- 2,076,063,570 cycles # 2.928 GHz
- 3,124,474,063 instructions # 1.50 insn per cycle
- 0.769324674 seconds time elapsed
+TOTAL : 0.485528 sec
+ 2,086,161,680 cycles # 2.950 GHz
+ 3,149,356,396 instructions # 1.51 insn per cycle
+ 0.766853446 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.647182e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.706650e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.709351e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.640879e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.699452e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.702171e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.865548 sec
- 6,314,327,402 cycles # 2.992 GHz
- 13,540,816,282 instructions # 2.14 insn per cycle
- 2.170188129 seconds time elapsed
+TOTAL : 1.863196 sec
+ 6,301,645,470 cycles # 3.002 GHz
+ 12,163,417,933 instructions # 1.93 insn per cycle
+ 2.157068829 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.736423e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.737265e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737265e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.763152e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.763992e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.763992e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.597065 sec
- 86,035,998,776 cycles # 3.009 GHz
- 135,911,265,736 instructions # 1.58 insn per cycle
- 28.601145029 seconds time elapsed
+TOTAL : 28.466782 sec
+ 86,160,161,464 cycles # 3.027 GHz
+ 135,907,402,983 instructions # 1.58 insn per cycle
+ 28.470931551 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.976771e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.989628e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.989628e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.954712e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.967174e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.967174e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.358802 sec
- 6,848,676,061 cycles # 2.899 GHz
- 19,439,456,701 instructions # 2.84 insn per cycle
- 2.362995374 seconds time elapsed
+TOTAL : 2.366132 sec
+ 6,848,483,827 cycles # 2.890 GHz
+ 19,440,750,063 instructions # 2.84 insn per cycle
+ 2.370332980 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.510619e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.516450e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.516450e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.511072e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.516863e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.516863e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.093889 sec
- 3,110,977,160 cycles # 2.835 GHz
- 6,719,869,092 instructions # 2.16 insn per cycle
- 1.098127483 seconds time elapsed
+TOTAL : 1.093285 sec
+ 3,106,954,835 cycles # 2.833 GHz
+ 6,720,019,206 instructions # 2.16 insn per cycle
+ 1.097556495 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.794946e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.802956e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.802956e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.791720e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.799978e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.799978e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.922821 sec
- 2,627,235,427 cycles # 2.838 GHz
- 5,970,250,488 instructions # 2.27 insn per cycle
- 0.926978795 seconds time elapsed
+TOTAL : 0.924560 sec
+ 2,625,881,689 cycles # 2.831 GHz
+ 5,970,468,600 instructions # 2.27 insn per cycle
+ 0.928699193 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.483560e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.489106e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.489106e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.485772e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.491338e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.491338e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.114177 sec
- 2,080,137,201 cycles # 1.861 GHz
- 3,494,948,543 instructions # 1.68 insn per cycle
- 1.118521627 seconds time elapsed
+TOTAL : 1.112143 sec
+ 2,079,682,688 cycles # 1.864 GHz
+ 3,494,926,799 instructions # 1.68 insn per cycle
+ 1.116310984 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index b38c13fcd9..0d88057431 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:09:56
+DATE: 2023-11-08_21:24:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.468828e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.491770e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.493892e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.461953e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.486921e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.488984e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.519876 sec
- 2,215,127,737 cycles # 2.957 GHz
- 3,487,212,374 instructions # 1.57 insn per cycle
- 0.807913712 seconds time elapsed
+TOTAL : 0.524274 sec
+ 2,213,988,684 cycles # 2.939 GHz
+ 3,460,274,141 instructions # 1.56 insn per cycle
+ 0.814878779 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.135164e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.161799e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.162966e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.131317e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.159899e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.161114e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.028303 sec
- 9,769,796,186 cycles # 2.979 GHz
- 22,335,132,843 instructions # 2.29 insn per cycle
- 3.336784998 seconds time elapsed
+TOTAL : 3.024560 sec
+ 9,783,019,983 cycles # 2.986 GHz
+ 21,052,355,005 instructions # 2.15 insn per cycle
+ 3.333798384 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.912244e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.913140e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.913140e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.908400e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.909295e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.909295e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.586099 sec
- 25,914,180,302 cycles # 3.017 GHz
- 79,445,505,152 instructions # 3.07 insn per cycle
- 8.590406292 seconds time elapsed
+TOTAL : 8.603127 sec
+ 25,922,951,314 cycles # 3.012 GHz
+ 79,444,287,848 instructions # 3.06 insn per cycle
+ 8.607377110 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4857) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.695684e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.699049e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.699049e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.601676e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.605199e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.605199e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.447189 sec
- 12,656,450,439 cycles # 2.844 GHz
- 38,554,825,829 instructions # 3.05 insn per cycle
- 4.451478069 seconds time elapsed
+TOTAL : 4.563626 sec
+ 12,670,494,381 cycles # 2.774 GHz
+ 38,555,115,428 instructions # 3.04 insn per cycle
+ 4.567958025 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.537952e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.556620e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.556620e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.436133e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.453065e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.453065e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.930375 sec
- 5,512,214,802 cycles # 2.850 GHz
- 13,486,265,307 instructions # 2.45 insn per cycle
- 1.934770358 seconds time elapsed
+TOTAL : 1.953575 sec
+ 5,515,640,809 cycles # 2.818 GHz
+ 13,484,131,277 instructions # 2.44 insn per cycle
+ 1.957940467 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.638550e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.660856e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.660856e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.530089e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.553433e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.553433e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.711054 sec
- 4,872,445,248 cycles # 2.842 GHz
- 12,141,983,198 instructions # 2.49 insn per cycle
- 1.715434660 seconds time elapsed
+TOTAL : 1.730211 sec
+ 4,882,100,767 cycles # 2.816 GHz
+ 12,140,913,078 instructions # 2.49 insn per cycle
+ 1.734496344 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.406789e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.420159e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.420159e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.332978e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.346275e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.346275e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.223975 sec
- 4,144,217,356 cycles # 1.862 GHz
- 6,340,578,545 instructions # 1.53 insn per cycle
- 2.228285470 seconds time elapsed
+TOTAL : 2.246181 sec
+ 4,144,338,295 cycles # 1.842 GHz
+ 6,339,235,304 instructions # 1.53 insn per cycle
+ 2.250536993 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1802) (512y: 93) (512z: 9358)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
index 46f37c0a90..154c33870f 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:10:33
+DATE: 2023-11-08_21:25:29
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.484364e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.507714e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.509764e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.466139e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.491413e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.493568e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.519968 sec
- 2,216,873,411 cycles # 2.952 GHz
- 3,459,675,597 instructions # 1.56 insn per cycle
- 0.809738739 seconds time elapsed
+TOTAL : 0.523033 sec
+ 2,231,792,351 cycles # 2.947 GHz
+ 3,493,743,246 instructions # 1.57 insn per cycle
+ 0.817222718 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.134555e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.161246e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.162402e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.134865e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.163582e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.164827e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.016486 sec
- 9,822,814,204 cycles # 3.004 GHz
- 22,339,986,571 instructions # 2.27 insn per cycle
- 3.325238208 seconds time elapsed
+TOTAL : 3.022684 sec
+ 9,525,982,822 cycles # 2.907 GHz
+ 21,759,904,749 instructions # 2.28 insn per cycle
+ 3.333718015 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.909809e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.910727e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.910727e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.890125e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.891036e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.891036e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.597381 sec
- 25,939,435,501 cycles # 3.017 GHz
- 79,457,351,519 instructions # 3.06 insn per cycle
- 8.601657625 seconds time elapsed
+TOTAL : 8.687196 sec
+ 25,936,497,205 cycles # 2.985 GHz
+ 79,455,431,598 instructions # 3.06 insn per cycle
+ 8.691442955 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4504) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.664829e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.668218e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.668218e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.674580e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.678053e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.678053e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.484461 sec
- 12,651,418,370 cycles # 2.819 GHz
- 38,525,727,884 instructions # 3.05 insn per cycle
- 4.488762135 seconds time elapsed
+TOTAL : 4.473580 sec
+ 12,663,684,829 cycles # 2.829 GHz
+ 38,526,072,859 instructions # 3.04 insn per cycle
+ 4.477928329 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12928) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.385701e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.404187e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.404187e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.447225e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.464376e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.464376e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.965077 sec
- 5,557,225,506 cycles # 2.823 GHz
- 13,610,780,927 instructions # 2.45 insn per cycle
- 1.969439061 seconds time elapsed
+TOTAL : 1.950551 sec
+ 5,554,043,311 cycles # 2.842 GHz
+ 13,609,444,575 instructions # 2.45 insn per cycle
+ 1.954818500 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.328216e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.349743e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.349743e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.528912e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.551046e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.551046e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.767465 sec
- 4,920,931,185 cycles # 2.779 GHz
- 12,278,542,674 instructions # 2.50 insn per cycle
- 1.771926617 seconds time elapsed
+TOTAL : 1.730043 sec
+ 4,918,299,350 cycles # 2.837 GHz
+ 12,276,281,852 instructions # 2.50 insn per cycle
+ 1.734286887 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.389874e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.403004e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.403004e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.227160e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.239598e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.239598e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.228912 sec
- 4,146,930,402 cycles # 1.858 GHz
- 6,446,453,346 instructions # 1.55 insn per cycle
- 2.233245374 seconds time elapsed
+TOTAL : 2.278650 sec
+ 4,148,690,065 cycles # 1.818 GHz
+ 6,446,007,726 instructions # 1.55 insn per cycle
+ 2.282996103 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1627) (512y: 191) (512z: 9356)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index 2048a9698e..f7c4424904 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:12:52
+DATE: 2023-11-08_21:27:51
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.071850e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.072225e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.072335e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.070515e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.070905e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.071008e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.421447 sec
- 8,245,731,454 cycles # 3.012 GHz
- 18,688,279,165 instructions # 2.27 insn per cycle
- 2.797097094 seconds time elapsed
+TOTAL : 2.420963 sec
+ 8,223,258,722 cycles # 3.000 GHz
+ 17,670,197,130 instructions # 2.15 insn per cycle
+ 2.797812392 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.261920e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.263777e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.264034e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.267469e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.269461e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.269740e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.993277 sec
- 12,924,149,664 cycles # 2.993 GHz
- 29,920,520,122 instructions # 2.32 insn per cycle
- 4.373104302 seconds time elapsed
+TOTAL : 3.983357 sec
+ 12,890,762,548 cycles # 2.986 GHz
+ 28,149,713,448 instructions # 2.18 insn per cycle
+ 4.374511500 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.414546e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.414780e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.414780e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.327736e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.327962e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.327962e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.278557 sec
- 18,784,400,880 cycles # 2.990 GHz
- 53,915,743,321 instructions # 2.87 insn per cycle
- 6.282578284 seconds time elapsed
+TOTAL : 6.345395 sec
+ 18,808,426,055 cycles # 2.963 GHz
+ 53,915,859,593 instructions # 2.87 insn per cycle
+ 6.349306785 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.622225e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.622313e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.622313e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.631387e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.631477e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.631477e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.260349 sec
- 9,843,353,366 cycles # 3.016 GHz
- 27,093,120,012 instructions # 2.75 insn per cycle
- 3.264542212 seconds time elapsed
+TOTAL : 3.247242 sec
+ 9,798,431,936 cycles # 3.015 GHz
+ 27,093,078,884 instructions # 2.77 insn per cycle
+ 3.251306892 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.543297e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.543763e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.543763e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.527269e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.527671e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.527671e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.494911 sec
- 4,247,565,583 cycles # 2.835 GHz
- 9,561,660,282 instructions # 2.25 insn per cycle
- 1.498994646 seconds time elapsed
+TOTAL : 1.502062 sec
+ 4,254,510,227 cycles # 2.826 GHz
+ 9,561,365,042 instructions # 2.25 insn per cycle
+ 1.506086006 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.041064e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.041630e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.041630e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.044745e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.045315e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.045315e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.312043 sec
- 3,711,873,932 cycles # 2.822 GHz
- 8,485,580,977 instructions # 2.29 insn per cycle
- 1.316064551 seconds time elapsed
+TOTAL : 1.310362 sec
+ 3,714,842,589 cycles # 2.828 GHz
+ 8,485,417,237 instructions # 2.28 insn per cycle
+ 1.314439582 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.655846e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.656376e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.656376e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.650927e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.651448e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.651448e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.450066 sec
- 2,692,078,825 cycles # 1.852 GHz
- 4,273,245,565 instructions # 1.59 insn per cycle
- 1.454158841 seconds time elapsed
+TOTAL : 1.452786 sec
+ 2,695,403,304 cycles # 1.852 GHz
+ 4,273,125,151 instructions # 1.59 insn per cycle
+ 1.456779010 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index fbbae31086..f73b319e4d 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:38:33
+DATE: 2023-11-08_21:55:03
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.071334e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.072304e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.072304e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.070004e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.071005e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.071005e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.376547 sec
- 8,066,576,997 cycles # 2.992 GHz
- 17,224,378,863 instructions # 2.14 insn per cycle
- 2.753340167 seconds time elapsed
+TOTAL : 2.374046 sec
+ 8,061,206,235 cycles # 2.993 GHz
+ 17,860,181,288 instructions # 2.22 insn per cycle
+ 2.750065172 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.219956e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.252584e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.252584e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.226901e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.259810e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.259810e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.983566 sec
- 12,755,700,095 cycles # 2.969 GHz
- 26,780,853,821 instructions # 2.10 insn per cycle
- 4.362214203 seconds time elapsed
+TOTAL : 3.996223 sec
+ 12,903,615,719 cycles # 2.989 GHz
+ 27,064,646,353 instructions # 2.10 insn per cycle
+ 4.375939404 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.520548e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.520796e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.520796e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.320809e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.321082e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.321082e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.203566 sec
- 18,829,459,475 cycles # 3.034 GHz
- 53,915,868,697 instructions # 2.86 insn per cycle
- 6.207586404 seconds time elapsed
+TOTAL : 6.351015 sec
+ 18,895,432,596 cycles # 2.975 GHz
+ 53,920,363,469 instructions # 2.85 insn per cycle
+ 6.355030283 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.632618e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632708e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632708e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.632581e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.632679e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.632679e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.239187 sec
- 9,805,468,555 cycles # 3.024 GHz
- 27,094,086,958 instructions # 2.76 insn per cycle
- 3.243245202 seconds time elapsed
+TOTAL : 3.239771 sec
+ 9,805,010,159 cycles # 3.023 GHz
+ 27,094,031,310 instructions # 2.76 insn per cycle
+ 3.243901475 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.541893e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.542348e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.542348e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.542776e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.543249e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.543249e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.496042 sec
- 4,247,154,617 cycles # 2.833 GHz
- 9,562,315,517 instructions # 2.25 insn per cycle
- 1.500165545 seconds time elapsed
+TOTAL : 1.496243 sec
+ 4,233,173,830 cycles # 2.823 GHz
+ 9,562,510,318 instructions # 2.26 insn per cycle
+ 1.500255263 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.062512e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.063083e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.063083e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.008828e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.009454e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.009454e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.305609 sec
- 3,707,362,205 cycles # 2.832 GHz
- 8,486,374,508 instructions # 2.29 insn per cycle
- 1.309600698 seconds time elapsed
+TOTAL : 1.322886 sec
+ 3,744,251,192 cycles # 2.823 GHz
+ 8,486,441,130 instructions # 2.27 insn per cycle
+ 1.326937869 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.623189e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.623772e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.623772e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.594601e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.595186e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.595186e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.463361 sec
- 2,697,367,089 cycles # 1.839 GHz
- 4,274,143,132 instructions # 1.58 insn per cycle
- 1.467446249 seconds time elapsed
+TOTAL : 1.474018 sec
+ 2,696,155,761 cycles # 1.825 GHz
+ 4,274,155,931 instructions # 1.59 insn per cycle
+ 1.478064357 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index c51993cada..7a2b2c0da9 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:13:56
+DATE: 2023-11-08_21:28:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.063023e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.063394e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.063534e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.069743e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.070108e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.070239e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.424921 sec
- 8,232,683,158 cycles # 2.990 GHz
- 17,655,317,796 instructions # 2.14 insn per cycle
- 2.812107310 seconds time elapsed
+TOTAL : 2.423143 sec
+ 8,082,723,641 cycles # 2.933 GHz
+ 18,147,438,278 instructions # 2.25 insn per cycle
+ 2.812330272 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.268141e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.269954e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.270195e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.271955e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.273887e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.274124e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.993040 sec
- 12,961,046,511 cycles # 3.002 GHz
- 29,041,240,897 instructions # 2.24 insn per cycle
- 4.374135451 seconds time elapsed
+TOTAL : 3.988843 sec
+ 13,001,327,656 cycles # 3.014 GHz
+ 27,551,753,777 instructions # 2.12 insn per cycle
+ 4.370037996 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.423791e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.424026e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.424026e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.093188e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.093423e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.093423e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.281065 sec
- 18,737,351,960 cycles # 2.982 GHz
- 53,924,990,961 instructions # 2.88 insn per cycle
- 6.285160496 seconds time elapsed
+TOTAL : 6.521355 sec
+ 18,798,207,330 cycles # 2.882 GHz
+ 53,926,908,182 instructions # 2.87 insn per cycle
+ 6.525452544 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32062) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.617244e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.617330e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.617330e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.629486e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.629575e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.629575e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.274940 sec
- 9,810,206,221 cycles # 2.993 GHz
- 27,090,315,670 instructions # 2.76 insn per cycle
- 3.279033724 seconds time elapsed
+TOTAL : 3.245853 sec
+ 9,848,079,716 cycles # 3.031 GHz
+ 27,090,265,030 instructions # 2.75 insn per cycle
+ 3.250037477 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96284) (avx2: 0) (512y: 0) (512z: 0)
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.504500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504945e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504945e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.490286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.490752e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.490752e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.511756 sec - 4,249,692,377 cycles # 2.805 GHz - 9,561,658,782 instructions # 2.25 insn per cycle - 1.515796071 seconds time elapsed +TOTAL : 1.517008 sec + 4,257,648,545 cycles # 2.800 GHz + 9,561,344,255 instructions # 2.25 insn per cycle + 1.521285854 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.067567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.068141e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.068141e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.021344e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.021901e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.021901e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.304248 sec - 3,697,935,435 cycles # 2.828 GHz - 8,485,512,243 instructions # 2.29 insn per cycle - 1.308302011 seconds time elapsed +TOTAL : 1.319133 sec + 3,701,318,743 cycles # 2.798 GHz + 8,485,189,781 instructions # 2.29 insn per cycle + 1.323286884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.626044e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.626572e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.626572e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.378941e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379448e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379448e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.462511 sec - 2,704,261,685 cycles # 1.846 GHz - 4,277,565,036 instructions # 1.58 insn per cycle - 1.466688212 seconds time elapsed +TOTAL : 1.566088 sec + 2,698,066,709 cycles # 1.719 GHz + 4,276,879,461 instructions # 1.59 insn per cycle + 1.570153625 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 0a60ba6d62..f4e838f103 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:14:59 +DATE: 2023-11-08_21:29:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757584e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758488e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758845e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.755384e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.756376e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.756775e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659896 sec - 5,702,631,198 cycles # 2.947 GHz - 11,810,983,379 instructions # 2.07 insn per cycle - 1.991424837 seconds time elapsed +TOTAL : 1.659165 sec + 5,717,115,891 cycles # 2.955 GHz + 12,190,075,892 instructions # 2.13 insn per cycle + 1.991284959 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.332515e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.333177e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333265e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328819e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.329492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329584e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929687 sec - 6,546,483,377 cycles # 2.952 GHz - 14,155,312,120 instructions # 2.16 insn per cycle - 2.273547514 seconds time elapsed +TOTAL : 1.928671 sec + 6,641,955,934 cycles # 3.003 GHz + 14,330,947,638 instructions # 2.16 insn per cycle + 2.270510678 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.817807e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.818080e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.818080e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.903818e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.904090e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.904090e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.991502 sec - 17,897,297,418 cycles # 2.986 GHz - 53,590,305,749 instructions # 2.99 insn per cycle - 5.995609214 seconds time elapsed +TOTAL : 5.935151 sec + 17,988,960,616 cycles # 3.029 GHz + 53,590,161,611 instructions # 2.98 insn per cycle + 5.939109392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.535145e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535592e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535592e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.520103e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520628e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520628e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.499015 sec - 4,559,682,745 cycles # 3.035 GHz - 13,762,791,022 instructions # 3.02 insn per cycle - 1.503172123 seconds time elapsed +TOTAL : 1.505890 sec + 4,563,568,647 cycles # 3.024 GHz + 13,762,453,321 instructions # 3.02 insn per cycle + 1.509910484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.101340e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.103065e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.103065e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.038019e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.039763e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.039763e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.748885 sec - 2,136,693,329 cycles # 2.841 GHz - 4,817,082,222 instructions # 2.25 insn per cycle - 0.752876610 seconds time elapsed +TOTAL : 0.756034 sec + 2,141,156,270 cycles # 2.820 GHz + 4,816,859,984 instructions # 2.25 insn per cycle + 0.760083736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.112158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114365e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.114365e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.079503e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.081743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081743e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.656308 sec - 1,869,942,366 cycles # 2.835 GHz - 4,274,318,244 instructions # 2.29 insn per cycle - 0.660301551 seconds time elapsed +TOTAL : 0.658883 sec + 1,871,387,054 cycles # 2.825 GHz + 4,273,792,692 instructions # 2.28 insn per cycle + 0.663026186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.296564e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.298817e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.298817e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.037980e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.040224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.040224e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.728798 sec - 1,352,736,555 cycles # 1.847 GHz - 2,158,877,197 instructions # 1.60 insn per cycle - 0.732817833 seconds time elapsed +TOTAL : 0.756823 sec + 1,355,166,582 cycles # 1.782 GHz + 2,158,764,056 instructions # 1.59 insn per cycle + 0.760952708 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2878) (512y: 49) (512z:79298) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 17034b30a2..6fa929f5b1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:39:36 +DATE: 2023-11-08_21:56:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.806522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.808414e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.808414e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.804869e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806749e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806749e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.595844 sec - 5,598,060,641 cycles # 2.994 GHz - 11,899,085,664 instructions # 2.13 insn per cycle - 1.927316991 seconds time elapsed +TOTAL : 1.602285 sec + 5,612,741,884 cycles # 2.994 GHz + 11,823,721,041 instructions # 2.11 insn per cycle + 1.932057655 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.306726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320071e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320071e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321250e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.334433e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.334433e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.896426 sec - 6,482,998,335 cycles # 2.990 GHz - 13,087,346,923 instructions # 2.02 insn per cycle - 2.228516012 seconds time elapsed +TOTAL : 1.875073 sec + 6,423,111,015 cycles # 2.987 GHz + 14,218,262,182 instructions # 2.21 insn per cycle + 2.206850504 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.982697e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.982966e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.982966e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.905231e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.905509e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.905509e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.882902 sec - 17,886,003,642 cycles # 3.039 GHz - 53,589,820,489 instructions # 3.00 insn per cycle - 5.886864227 seconds time elapsed +TOTAL : 5.938687 sec + 17,836,764,476 cycles # 3.002 GHz + 53,590,153,759 instructions # 3.00 insn per cycle + 5.942639449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.517559e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518006e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518006e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489420e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489830e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489830e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.508545 sec - 4,560,262,414 cycles # 3.016 GHz - 13,763,353,615 instructions # 3.02 insn per cycle - 1.512732617 seconds time elapsed +TOTAL : 1.517580 sec + 4,611,683,817 cycles # 3.032 GHz + 13,763,345,896 instructions # 2.98 insn per cycle + 1.521625428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.047943e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.049624e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.049624e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.247085e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248950e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248950e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.755133 sec - 2,153,006,129 cycles # 2.839 GHz - 4,818,213,561 instructions # 2.24 insn per cycle - 0.759225829 seconds time elapsed +TOTAL : 0.733666 sec + 2,134,815,435 cycles # 2.897 GHz + 4,817,815,542 instructions # 2.26 insn per cycle + 0.737580401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.134004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.136209e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.136209e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.255023e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257521e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257521e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.654556 sec - 1,870,329,136 cycles # 2.842 GHz - 4,274,869,931 instructions # 2.29 insn per cycle - 0.658687365 seconds time elapsed +TOTAL : 0.644323 sec + 1,868,915,722 cycles # 2.886 GHz + 4,274,871,857 instructions # 2.29 insn per cycle + 0.648325497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.265196e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.267580e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.267580e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.514603e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.516833e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.516833e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732395 sec - 1,354,970,411 cycles # 1.842 GHz - 2,159,667,135 instructions # 1.59 insn per cycle - 0.736399157 seconds time elapsed +TOTAL : 0.708018 sec + 1,353,648,095 cycles # 1.903 GHz + 2,159,618,866 instructions # 1.60 insn per cycle + 0.711901071 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2878) (512y: 49) (512z:79298) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 9247dc6a21..2b69abf3e0 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:15:46 +DATE: 2023-11-08_21:30:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757824e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758656e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758919e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.751553e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.752429e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.752778e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.658123 sec - 5,805,953,943 cycles # 3.008 GHz - 12,018,291,784 instructions # 2.07 insn per cycle - 1.988767308 seconds time elapsed +TOTAL : 1.662264 sec + 5,791,514,994 cycles # 2.989 GHz + 11,290,505,064 instructions # 1.95 insn per cycle + 1.994487544 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327280e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.327957e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328041e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318654e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319320e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319463e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929152 sec - 6,666,976,802 cycles # 3.013 GHz - 13,831,721,664 instructions # 2.07 insn per cycle - 2.269150647 seconds time elapsed +TOTAL : 1.936834 sec + 6,513,518,428 cycles # 2.942 GHz + 13,310,876,477 instructions # 2.04 insn per cycle + 2.270995377 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.798758e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.799028e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.799028e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.877357e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.877629e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.877629e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.002749 sec - 17,897,748,334 cycles # 2.981 GHz - 53,583,210,251 instructions # 2.99 insn per cycle - 6.006727820 seconds time elapsed +TOTAL : 5.953085 sec + 17,926,444,710 cycles # 3.010 GHz + 53,580,674,845 instructions # 2.99 insn per cycle + 5.957045253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.533102e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533527e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533527e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.538806e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539230e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539230e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.498905 sec - 4,550,573,846 cycles # 3.029 GHz - 13,756,139,320 instructions # 3.02 insn per cycle - 1.503009468 seconds time elapsed +TOTAL : 1.497134 sec + 4,549,359,025 cycles # 3.032 GHz + 13,755,898,061 instructions # 3.02 insn per cycle + 1.501295301 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.049905e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051589e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051589e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.000854e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.002553e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.002553e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.753863 sec - 2,147,980,052 cycles # 2.837 GHz - 4,819,413,658 instructions # 2.24 insn per cycle - 0.757858909 seconds time elapsed +TOTAL : 0.759453 sec + 2,151,217,111 cycles # 2.820 GHz + 4,818,966,673 instructions # 2.24 insn per cycle + 0.763529614 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.121398e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.123528e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.123528e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.076028e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.078137e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078137e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.655569 sec - 1,875,337,702 cycles # 2.847 GHz - 4,276,013,202 instructions # 2.28 insn per cycle - 0.659452126 seconds time elapsed +TOTAL : 0.658855 sec + 1,875,464,841 cycles # 2.832 GHz + 4,275,819,002 instructions # 2.28 insn per cycle + 0.662852680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.258028e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.260328e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.260328e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.283691e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.286276e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.286276e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732438 sec - 1,358,895,231 cycles # 1.851 GHz - 2,165,631,438 instructions # 1.59 insn per cycle - 0.736476884 seconds time elapsed +TOTAL : 0.730286 sec + 1,357,956,935 cycles # 1.851 GHz + 2,164,994,730 instructions # 1.59 insn per cycle + 0.734341079 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3475) (512y: 34) (512z:79492) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 67db6760e6..c2c8a96928 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:16:34 +DATE: 2023-11-08_21:31:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.697393e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.698008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.698206e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.686778e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.687273e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687409e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.168263 sec - 7,466,161,453 cycles # 3.002 GHz - 16,782,968,221 instructions # 2.25 insn per cycle - 2.544374597 seconds time elapsed +TOTAL : 2.171824 sec + 7,456,169,166 cycles # 2.995 GHz + 14,898,137,129 instructions # 2.00 insn per cycle + 2.549362993 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111494e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111753e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111788e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112892e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113171e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113203e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.403934 sec - 11,261,999,951 cycles # 3.015 GHz - 23,279,217,600 instructions # 2.07 insn per cycle - 3.795199307 seconds time elapsed +TOTAL : 3.401203 sec + 11,249,483,891 cycles # 3.009 GHz + 24,262,391,957 instructions # 2.16 insn per cycle + 3.794357278 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.891205e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.891420e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.891420e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.772311e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.772526e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.772526e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.695697 sec - 19,121,802,644 cycles # 2.855 GHz - 54,152,938,154 instructions # 2.83 insn per cycle - 6.699723618 seconds time elapsed +TOTAL : 6.810863 sec + 19,135,542,784 cycles # 2.808 GHz + 54,153,577,866 instructions # 2.83 insn per cycle + 6.814854998 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589938e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590022e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590022e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589475e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.589562e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589562e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327144 sec - 9,411,187,085 cycles # 2.826 GHz - 26,159,441,613 instructions # 2.78 insn per cycle - 3.331341639 seconds time elapsed +TOTAL : 3.327738 sec + 9,417,973,850 cycles # 2.827 GHz + 26,159,432,180 instructions # 2.78 insn per cycle + 3.331899471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96005) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.556465e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556911e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556911e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.728829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729288e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729288e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488421 sec - 4,038,495,427 cycles # 2.707 GHz - 9,228,280,089 instructions # 2.29 insn per cycle - 1.492543554 seconds time elapsed +TOTAL : 1.420979 sec + 4,041,656,459 cycles # 2.838 GHz + 9,227,906,681 instructions # 2.28 insn per cycle + 1.425059392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.276116e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276827e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.276827e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.219686e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.220314e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.220314e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.240270 sec - 3,525,917,357 cycles # 2.835 GHz - 8,175,363,577 instructions # 2.32 insn per cycle - 1.244573424 seconds time elapsed +TOTAL : 1.256653 sec + 3,545,597,499 cycles # 2.814 GHz + 8,175,250,543 instructions # 2.31 insn per cycle + 1.260805357 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671636e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672174e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672174e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.660023e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.660558e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.660558e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.443903 sec - 2,654,961,238 cycles # 1.834 GHz - 4,155,116,507 instructions # 1.57 insn per cycle - 1.448186385 seconds time elapsed +TOTAL : 1.447622 sec + 2,657,673,224 cycles # 1.832 GHz + 4,154,915,823 instructions # 1.56 insn per cycle + 1.451764331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2045) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ba876e5994..485a0059f2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:17:35 +DATE: 2023-11-08_21:32:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.679011e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.679665e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.679866e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.688491e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689012e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.689176e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.173579 sec - 7,474,410,637 cycles # 3.001 GHz - 15,946,585,145 instructions # 2.13 insn per cycle - 2.550103231 seconds time elapsed +TOTAL : 2.168620 sec + 7,451,542,055 cycles # 2.994 GHz + 15,551,253,703 instructions # 2.09 insn per cycle + 2.545633518 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109202e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109492e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108171e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.402138 sec - 11,227,553,919 cycles # 3.005 GHz - 23,286,904,291 instructions # 2.07 insn per cycle - 3.792137186 seconds time elapsed +TOTAL : 3.405108 sec + 11,192,783,578 cycles # 3.001 GHz + 25,734,796,379 instructions # 2.30 insn per cycle + 3.786804275 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.862068e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.862272e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.862272e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.066104e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.066369e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.066369e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.723543 sec - 19,074,467,052 cycles # 2.836 GHz - 54,156,087,092 instructions # 2.84 insn per cycle - 6.727488337 seconds time elapsed +TOTAL : 6.548059 sec + 19,079,779,477 cycles # 2.913 GHz + 54,153,651,610 instructions # 2.84 insn per cycle + 6.552064899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.568667e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568765e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568765e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589149e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.589238e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589238e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.375716 sec - 9,382,313,393 cycles # 2.776 GHz - 26,079,058,590 instructions # 2.78 insn per cycle - 3.379999018 seconds time elapsed +TOTAL : 3.327579 sec + 9,382,040,636 cycles # 2.817 GHz + 26,078,619,591 instructions # 2.78 insn per cycle + 3.331633706 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95899) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662540e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.663002e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.663002e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.662193e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.662639e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662639e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.448110 sec - 4,074,555,185 cycles # 2.807 GHz - 9,213,769,276 instructions # 2.26 insn per cycle - 1.452285529 seconds time elapsed +TOTAL : 1.447113 sec + 4,073,138,574 cycles # 2.808 GHz + 9,213,586,675 instructions # 2.26 insn per cycle + 1.451209760 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.250454e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.251202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.251202e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.194379e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.195039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.195039e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.248074 sec - 3,536,570,557 cycles # 2.826 GHz - 8,168,521,757 instructions # 2.31 insn per cycle - 1.252256213 seconds time elapsed +TOTAL : 1.264023 sec + 3,548,672,085 cycles # 2.800 GHz + 8,168,128,611 instructions # 2.30 insn per cycle + 1.268138683 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.691090e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.691677e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.691677e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.707082e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.707666e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.707666e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.437256 sec - 2,622,132,529 cycles # 1.820 GHz - 4,153,851,791 instructions # 1.58 insn per cycle - 1.441375266 seconds time elapsed +TOTAL : 1.430570 sec + 2,620,935,291 cycles # 1.830 GHz + 4,154,056,327 instructions # 1.58 insn per cycle + 1.434770233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1492) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 32c5e2345e..45ec48d9b4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:10 +DATE: 2023-11-08_21:26:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.931878e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.341004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.663503e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.850720e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.646421e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446607 sec - 1,970,164,515 cycles # 2.938 GHz - 2,759,248,123 instructions # 1.40 insn per cycle - 0.729204009 seconds time elapsed +TOTAL : 0.445414 sec + 1,963,940,666 cycles # 2.941 GHz + 2,761,951,187 instructions # 1.41 insn per cycle + 0.725454441 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.710415e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.163714e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.497427e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.571453e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.132541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489040e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523022 sec - 2,217,601,456 cycles # 2.938 GHz - 3,205,519,009 instructions # 1.45 insn per cycle - 0.813078242 seconds time elapsed +TOTAL : 0.525989 sec + 2,266,225,819 cycles # 2.950 GHz + 3,255,459,976 instructions # 1.44 insn per cycle + 0.825689166 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.073669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.074272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096702e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.548155 sec - 4,698,700,649 cycles # 3.029 GHz - 13,467,797,998 instructions # 2.87 insn per cycle - 1.552304744 seconds time elapsed +TOTAL : 1.547835 sec + 4,705,088,880 cycles # 3.034 GHz + 13,467,070,551 instructions # 2.86 insn per cycle + 1.551905661 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021816e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.836387e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.906822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.906822e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862036 sec - 2,624,478,574 cycles # 3.032 GHz - 7,556,486,050 instructions # 2.88 insn per cycle - 0.866308924 seconds time elapsed +TOTAL : 0.914850 sec + 2,629,820,703 cycles # 2.863 GHz + 7,555,643,977 instructions # 2.87 insn per cycle + 0.919312372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.306326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524533e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524533e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.179916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.388522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.388522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.517734 sec - 1,480,526,951 cycles # 2.839 GHz - 3,123,082,416 instructions # 2.11 insn per cycle - 0.522085763 seconds time elapsed +TOTAL : 0.538121 sec + 1,483,909,982 cycles # 2.739 GHz + 3,122,112,991 instructions # 2.10 insn per cycle + 0.542506000 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.669407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.933881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.933881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.492769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.748148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748148e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.468132 sec - 1,341,729,382 cycles # 2.844 GHz - 2,984,537,487 instructions # 2.22 insn per cycle - 0.472335074 seconds time elapsed +TOTAL : 0.492302 sec + 1,352,205,323 cycles # 2.727 GHz + 2,983,986,621 instructions # 2.21 insn per cycle + 0.496759795 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.279474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.384367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.384367e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.316160e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.426685e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.426685e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.743758 sec - 1,327,382,690 cycles # 1.776 GHz - 1,956,119,028 instructions # 1.47 insn per cycle - 0.747985259 seconds time elapsed +TOTAL : 0.732612 sec + 1,330,714,647 cycles # 1.807 GHz + 1,956,053,126 instructions # 1.47 insn per cycle + 0.737097876 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 83cbc116b3..9573fdc8ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:36:51 +DATE: 2023-11-08_21:53:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.568026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132079e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132079e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.674751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241786e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.473711 sec - 2,006,451,769 cycles # 2.929 GHz - 2,970,353,925 instructions # 1.48 insn per cycle - 0.742629859 seconds time elapsed +TOTAL : 0.472075 sec + 2,011,630,446 cycles # 2.946 GHz + 2,977,593,506 instructions # 1.48 insn per cycle + 0.740354864 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.250433e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283042e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283042e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.374405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.374405e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.748674 sec - 3,002,657,574 cycles # 2.966 GHz - 4,543,695,427 instructions # 1.51 insn per cycle - 1.069550305 seconds time elapsed +TOTAL : 0.746007 sec + 2,930,819,015 cycles # 2.951 GHz + 4,513,689,699 instructions # 1.54 insn per cycle + 1.051041659 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091931e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091931e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.067462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089978e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.560726 sec - 4,731,718,585 cycles # 3.025 GHz - 13,472,168,375 instructions # 2.85 insn per cycle - 1.565141837 seconds time elapsed +TOTAL : 1.563342 sec + 4,743,647,659 cycles # 3.027 GHz + 13,474,115,002 instructions # 2.84 insn per cycle + 1.567732700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.973174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.973174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931899e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.004806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.004806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.892296 sec - 2,670,244,018 cycles # 2.980 GHz - 7,605,526,435 instructions # 2.85 insn per cycle - 0.896907337 seconds time elapsed +TOTAL : 0.876421 sec + 2,657,928,129 cycles # 3.020 GHz + 7,605,320,089 instructions # 2.86 insn per cycle + 0.880831982 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.091835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.296236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.296236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.284426e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500691e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.561077 sec - 1,524,432,631 cycles # 2.698 GHz - 3,172,781,548 instructions # 2.08 insn per cycle - 0.565642937 seconds time elapsed +TOTAL : 0.528369 sec + 1,515,520,073 cycles # 2.846 GHz + 3,173,010,189 instructions # 2.09 insn per cycle + 0.533003329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.608228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.871141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.871141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.626971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.890157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.483758 sec - 1,382,209,807 cycles # 2.835 GHz - 3,035,256,040 instructions # 2.20 insn per cycle - 0.488244630 seconds time elapsed +TOTAL : 0.480865 sec + 1,378,241,594 cycles # 2.844 GHz + 3,034,725,088 instructions # 2.20 insn per cycle + 0.485339539 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.425183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.544675e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.544675e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.445545e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.566504e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.566504e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.707575 sec - 1,368,070,277 cycles # 1.923 GHz - 1,995,483,449 instructions # 1.46 insn per cycle - 0.712159059 seconds time elapsed +TOTAL : 0.701984 sec + 1,365,857,372 cycles # 1.935 GHz + 1,995,672,274 instructions # 1.46 insn per cycle + 0.706431315 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 5c16312148..a982c1092c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:27 +DATE: 2023-11-08_21:26:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.898292e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.236740e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.548470e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.808432e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.231946e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554075e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444696 sec - 1,938,726,844 cycles # 2.937 GHz - 2,756,323,630 instructions # 1.42 insn per cycle - 0.718363875 seconds time elapsed +TOTAL : 0.446409 sec + 1,914,723,819 cycles # 2.858 GHz + 2,720,530,830 instructions # 1.42 insn per cycle + 0.726781250 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.682843e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.082328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.409380e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.542836e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.030986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.390683e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.522202 sec - 2,220,530,283 cycles # 2.941 GHz - 3,184,953,404 instructions # 1.43 insn per cycle - 0.811776517 seconds time elapsed +TOTAL : 0.529359 sec + 2,191,645,691 cycles # 2.864 GHz + 3,157,433,372 instructions # 1.44 insn per cycle + 0.823018193 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092872e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036305e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058434e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.552829 sec - 4,705,329,544 cycles # 3.023 GHz - 13,461,758,666 instructions # 2.86 insn per cycle - 1.556952692 seconds time elapsed +TOTAL : 1.603791 sec + 4,708,850,584 cycles # 2.929 GHz + 13,461,227,684 instructions # 2.86 insn per cycle + 1.607981971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.854678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928501e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862207 sec - 2,624,178,818 cycles # 3.031 GHz - 7,555,487,904 instructions # 2.88 insn per cycle - 0.866510467 seconds time elapsed +TOTAL : 0.906299 sec + 2,638,123,420 cycles # 2.899 GHz + 7,554,662,347 instructions # 2.86 insn per cycle + 0.910729092 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.292100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331862e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.519779 sec - 1,479,324,919 cycles # 2.825 GHz - 3,121,432,800 instructions # 2.11 insn per cycle - 0.524166869 seconds time elapsed +TOTAL : 0.548282 sec + 1,490,121,110 cycles # 2.699 GHz + 3,120,571,278 instructions # 2.09 insn per cycle + 0.552853693 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.586783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.851292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.851292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.716719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716719e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.479341 sec - 1,345,156,968 cycles # 2.785 GHz - 2,982,279,143 instructions # 2.22 insn per cycle - 0.483569808 seconds time elapsed +TOTAL : 0.496477 sec + 1,349,987,385 cycles # 2.699 GHz + 2,981,775,320 instructions # 2.21 insn per cycle + 0.500801099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.481639e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.600263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.600263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.283025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.395178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.395178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.683909 sec - 1,326,826,217 cycles # 1.930 GHz - 1,955,120,469 instructions # 1.47 insn per cycle - 0.688253496 seconds time elapsed +TOTAL : 0.742923 sec + 1,336,539,142 cycles # 1.791 GHz + 1,954,402,399 instructions # 1.46 insn per cycle + 0.747445158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 59e9dbfb13..0870ac1612 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:45 +DATE: 2023-11-08_21:26:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.904199e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231536e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359887e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.731772e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218499e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.346344e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.438419 sec - 1,915,720,301 cycles # 2.940 GHz - 2,722,845,778 instructions # 1.42 insn per cycle - 0.708695201 seconds time elapsed +TOTAL : 0.443647 sec + 1,860,995,101 cycles # 2.829 GHz + 2,577,640,181 instructions # 1.39 insn per cycle + 0.715495532 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.256707e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.834983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.952518e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.975472e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.830515e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954464e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.473385 sec - 2,068,832,196 cycles # 2.955 GHz - 2,965,580,704 instructions # 1.43 insn per cycle - 0.757067346 seconds time elapsed +TOTAL : 0.479313 sec + 1,996,286,635 cycles # 2.831 GHz + 2,879,932,210 instructions # 1.44 insn per cycle + 0.762534142 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.135878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.068560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092999e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.462461 sec - 4,454,737,328 cycles # 3.039 GHz - 13,053,159,453 instructions # 2.93 insn per cycle - 1.466494148 seconds time elapsed +TOTAL : 1.555116 sec + 4,461,765,661 cycles # 2.863 GHz + 13,052,553,175 instructions # 2.93 insn per cycle + 1.559192669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.046237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238088e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.882925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070631e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.557296 sec - 1,699,998,155 cycles # 3.031 GHz - 4,515,681,552 instructions # 2.66 insn per cycle - 0.561435544 seconds time elapsed +TOTAL : 0.589518 sec + 1,706,750,598 cycles # 2.878 GHz + 4,515,023,670 instructions # 2.65 insn per cycle + 0.593859816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.648399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.355867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.355867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.765834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.493743e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.493743e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311054 sec - 851,131,460 cycles # 2.704 GHz - 1,899,263,660 instructions # 2.23 insn per cycle - 0.315235937 seconds time elapsed +TOTAL : 0.305319 sec + 853,645,854 cycles # 2.763 GHz + 1,898,477,314 instructions # 2.22 insn per cycle + 0.309705869 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.243995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.098185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.141881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.979826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979826e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282792 sec - 800,211,416 cycles # 2.794 GHz - 1,822,370,089 instructions # 2.28 insn per cycle - 0.286974618 seconds time elapsed +TOTAL : 0.287752 sec + 800,772,449 cycles # 2.748 GHz + 1,821,769,219 instructions # 2.28 insn per cycle + 0.292040341 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 29,217,754 cycles # 2.652 GHz - 42,284,295 instructions # 1.45 insn per cycle - 0.011406114 seconds time elapsed + 29,120,008 cycles # 2.647 GHz + 41,681,258 instructions # 1.43 insn per cycle + 0.011379573 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index f15afb12c1..0597ee22a3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:37:10 +DATE: 2023-11-08_21:53:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.572083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.023629e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.023629e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.639706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.257120e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.257120e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.454387 sec - 1,955,352,187 cycles # 2.938 GHz - 2,863,812,902 instructions # 1.46 insn per cycle - 0.722319097 seconds time elapsed +TOTAL : 0.449974 sec + 1,947,595,961 cycles # 2.942 GHz + 2,880,549,080 instructions # 1.48 insn per cycle + 0.719459148 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.087118e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.599283e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.599283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.168463e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.812098e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.812098e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.623566 sec - 2,498,674,729 cycles # 2.923 GHz - 3,766,117,574 instructions # 1.51 insn per cycle - 0.913465239 seconds time elapsed +TOTAL : 0.616345 sec + 2,486,978,198 cycles # 2.935 GHz + 3,790,315,811 instructions # 1.52 insn per cycle + 0.904030203 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.124937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.150391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.150391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.130080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155980e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.480709 sec - 4,471,348,915 cycles # 3.013 GHz - 13,056,806,498 instructions # 2.92 insn per cycle - 1.485019275 seconds time elapsed +TOTAL : 1.473693 sec + 4,471,154,333 cycles # 3.027 GHz + 13,056,458,670 instructions # 2.92 insn per cycle + 1.477812555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.015036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.208624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.208624e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.025004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.219703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.567631 sec - 1,721,622,943 cycles # 3.014 GHz - 4,563,283,810 instructions # 2.65 insn per cycle - 0.571796628 seconds time elapsed +TOTAL : 0.566150 sec + 1,723,667,712 cycles # 3.025 GHz + 4,563,297,886 instructions # 2.65 insn per cycle + 0.570436411 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.904492e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.858362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.587732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.587732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.302253 sec - 872,846,100 cycles # 2.852 GHz - 1,935,401,156 instructions # 2.22 insn per cycle - 0.306655862 seconds time elapsed +TOTAL : 0.303895 sec + 871,800,602 cycles # 2.835 GHz + 1,935,423,519 instructions # 2.22 insn per cycle + 0.308064640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.271441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.120717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.340209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.201036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.201036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.285638 sec - 819,147,203 cycles # 2.831 GHz - 1,858,340,668 instructions # 2.27 insn per cycle - 0.289825539 seconds time elapsed +TOTAL : 0.282456 sec + 818,779,897 cycles # 2.862 GHz + 1,858,681,592 instructions # 2.27 insn per cycle + 0.286757422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -211,9 +211,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 37,779,421 cycles # 2.664 GHz - 50,267,131 instructions # 1.33 insn per cycle - 0.014729622 seconds time elapsed + 37,403,629 cycles # 2.691 GHz + 50,469,890 instructions # 1.35 insn per cycle + 0.014372629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index c8e32c45f6..1f88f16cf0 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:01 +DATE: 2023-11-08_21:26:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.816263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.233557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356584e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.710525e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326321e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439029 sec - 1,906,384,387 cycles # 2.932 GHz - 2,668,630,925 instructions # 1.40 insn per cycle - 0.709025104 seconds time elapsed +TOTAL : 0.440656 sec + 1,914,804,492 cycles # 2.925 GHz + 2,653,138,253 instructions # 1.39 insn per cycle + 0.711749358 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.165457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788318e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899924e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.891594e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.784645e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.475153 sec - 2,060,825,458 cycles # 2.945 GHz - 2,959,751,148 instructions # 1.44 insn per cycle - 0.758667305 seconds time elapsed +TOTAL : 0.476610 sec + 2,083,856,457 cycles # 2.940 GHz + 2,965,032,628 instructions # 1.42 insn per cycle + 0.765879589 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129719e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.470613 sec - 4,452,780,841 cycles # 3.021 GHz - 13,033,295,085 instructions # 2.93 insn per cycle - 1.474743963 seconds time elapsed +TOTAL : 1.470593 sec + 4,452,128,732 cycles # 3.020 GHz + 13,033,118,765 instructions # 2.93 insn per cycle + 1.474660881 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.000043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.190804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.190804e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.040157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.234537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.234537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.566289 sec - 1,691,331,084 cycles # 2.968 GHz - 4,511,809,710 instructions # 2.67 insn per cycle - 0.570477990 seconds time elapsed +TOTAL : 0.558718 sec + 1,691,566,910 cycles # 3.008 GHz + 4,511,110,866 instructions # 2.67 insn per cycle + 0.562886591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.392978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.034440e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.034440e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.942184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.690459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690459e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325119 sec - 853,124,200 cycles # 2.596 GHz - 1,896,337,755 instructions # 2.22 insn per cycle - 0.329328797 seconds time elapsed +TOTAL : 0.296099 sec + 853,486,904 cycles # 2.847 GHz + 1,895,390,282 instructions # 2.22 insn per cycle + 0.300311325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.399192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.280649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.280649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.374489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.242458e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.242458e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.275864 sec - 799,266,525 cycles # 2.860 GHz - 1,818,357,527 instructions # 2.28 insn per cycle - 0.279975539 seconds time elapsed +TOTAL : 0.277008 sec + 800,885,707 cycles # 2.855 GHz + 1,817,516,411 instructions # 2.27 insn per cycle + 0.281135474 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 28,695,242 cycles # 2.686 GHz - 41,682,313 instructions # 1.45 insn per cycle - 0.011083970 seconds time elapsed + 28,754,068 cycles # 2.640 GHz + 40,955,371 instructions # 1.42 insn per cycle + 0.011419598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1932) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 2f090614c3..d5ef07e007 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:17 +DATE: 2023-11-08_21:27:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.924011e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312316e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.652376e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.821562e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300473e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.628825e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444193 sec - 1,982,708,723 cycles # 2.960 GHz - 2,773,326,834 instructions # 1.40 insn per cycle - 0.727594315 seconds time elapsed +TOTAL : 0.447954 sec + 1,932,564,435 cycles # 2.921 GHz + 2,743,560,511 instructions # 1.42 insn per cycle + 0.719731147 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.716781e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.189044e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.525460e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.575286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143575e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.499311e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.521362 sec - 2,209,841,644 cycles # 2.939 GHz - 3,173,284,555 instructions # 1.44 insn per cycle - 0.811280771 seconds time elapsed +TOTAL : 0.523239 sec + 2,243,203,062 cycles # 2.949 GHz + 3,244,551,518 instructions # 1.45 insn per cycle + 0.818196957 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.069116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.091100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554251 sec - 4,735,824,731 cycles # 3.041 GHz - 13,470,683,397 instructions # 2.84 insn per cycle - 1.558385201 seconds time elapsed +TOTAL : 1.554535 sec + 4,725,018,841 cycles # 3.035 GHz + 13,469,753,614 instructions # 2.85 insn per cycle + 1.558693291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.965218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.046371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046371e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.855114 sec - 2,601,303,673 cycles # 3.029 GHz - 7,389,579,625 instructions # 2.84 insn per cycle - 0.859411839 seconds time elapsed +TOTAL : 0.853333 sec + 2,596,868,107 cycles # 3.030 GHz + 7,388,624,187 instructions # 2.85 insn per cycle + 0.857591565 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.304731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.304731e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.332912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554037e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.550950 sec - 1,470,989,933 cycles # 2.653 GHz - 3,058,765,662 instructions # 2.08 insn per cycle - 0.555184249 seconds time elapsed +TOTAL : 0.513899 sec + 1,466,763,063 cycles # 2.835 GHz + 3,057,876,447 instructions # 2.08 insn per cycle + 0.518107133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.774277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060098e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060098e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058907e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455765 sec - 1,309,522,407 cycles # 2.852 GHz - 2,933,428,757 instructions # 2.24 insn per cycle - 0.459981977 seconds time elapsed +TOTAL : 0.455720 sec + 1,306,910,741 cycles # 2.845 GHz + 2,932,818,419 instructions # 2.24 insn per cycle + 0.460076062 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.411920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.391166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.500870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.500870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.703353 sec - 1,366,582,014 cycles # 1.933 GHz - 1,972,774,215 instructions # 1.44 insn per cycle - 0.707707323 seconds time elapsed +TOTAL : 0.709219 sec + 1,365,455,058 cycles # 1.916 GHz + 1,971,797,344 instructions # 1.44 insn per cycle + 0.713482957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f9fb6155f7..6e69f82aee 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:35 +DATE: 2023-11-08_21:27:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.886874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228157e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.568514e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.812345e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.208019e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.520614e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444068 sec - 1,946,521,853 cycles # 2.951 GHz - 2,755,422,178 instructions # 1.42 insn per cycle - 0.717280231 seconds time elapsed +TOTAL : 0.446229 sec + 1,955,610,259 cycles # 2.936 GHz + 2,744,647,203 instructions # 1.40 insn per cycle + 0.725146174 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.675020e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.027076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.349457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.529337e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.985996e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.326246e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523198 sec - 2,222,274,900 cycles # 2.946 GHz - 3,198,191,753 instructions # 1.44 insn per cycle - 0.813003520 seconds time elapsed +TOTAL : 0.522218 sec + 2,238,618,942 cycles # 2.943 GHz + 3,202,939,408 instructions # 1.43 insn per cycle + 0.817604471 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069395e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091866e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.065182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.087517e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087517e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554132 sec - 4,733,166,680 cycles # 3.039 GHz - 13,456,716,984 instructions # 2.84 insn per cycle - 1.558278315 seconds time elapsed +TOTAL : 1.560291 sec + 4,729,799,308 cycles # 3.025 GHz + 13,455,876,389 instructions # 2.84 insn per cycle + 1.564515481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963106e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038064e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038064e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.946971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.020229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020229e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.856001 sec - 2,603,447,344 cycles # 3.028 GHz - 7,393,362,148 instructions # 2.84 insn per cycle - 0.860294166 seconds time elapsed +TOTAL : 0.863034 sec + 2,601,868,480 cycles # 3.003 GHz + 7,392,543,085 instructions # 2.84 insn per cycle + 0.867199240 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.354162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573385e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573385e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.323539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.538773e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538773e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.509733 sec - 1,467,381,346 cycles # 2.859 GHz - 3,058,521,485 instructions # 2.08 insn per cycle - 0.513844239 seconds time elapsed +TOTAL : 0.514660 sec + 1,469,850,553 cycles # 2.835 GHz + 3,058,079,146 instructions # 2.08 insn per cycle + 0.519050232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.783084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.767525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049329e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.454796 sec - 1,307,019,802 cycles # 2.851 GHz - 2,934,565,738 instructions # 2.25 insn per cycle - 0.459066978 seconds time elapsed +TOTAL : 0.456721 sec + 1,309,025,943 cycles # 2.843 GHz + 2,933,534,120 instructions # 2.24 insn per cycle + 0.460967936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.408065e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519741e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519741e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.405794e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.516831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.516831e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.704354 sec - 1,368,218,437 cycles # 1.933 GHz - 1,972,609,636 instructions # 1.44 insn per cycle - 0.708886358 seconds time elapsed +TOTAL : 0.704869 sec + 1,364,487,579 cycles # 1.926 GHz + 1,971,713,310 instructions # 1.45 insn per cycle + 0.709028391 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From bbeba6dec51d6ae7fe3021444ed3e3f9391736a1 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:13:34 +0100 Subject: [PATCH 06/14] [gpucpp] rerun 18 tmad tests, Olivier's patch now fixes crash #781 in ggttggg, no change in performance --- .../log_eemumu_mad_d_inl0_hrd0.txt | 136 ++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 138 ++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 136 ++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 132 ++-- .../log_ggtt_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggtt_mad_m_inl0_hrd0.txt | 134 ++-- .../log_ggttg_mad_d_inl0_hrd0.txt | 138 ++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggttg_mad_m_inl0_hrd0.txt | 134 ++-- .../log_ggttgg_mad_d_inl0_hrd0.txt | 134 ++-- .../log_ggttgg_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggttgg_mad_m_inl0_hrd0.txt | 136 ++--- .../log_ggttggg_mad_d_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_ggttggg_mad_f_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_ggttggg_mad_m_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 ++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 134 ++-- .../log_gqttq_mad_m_inl0_hrd0.txt | 136 ++--- 18 files changed, 2685 insertions(+), 1071 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index bcf56600ba..383178f656 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,24 +16,24 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:13 +DATE: 2023-11-08_22:08:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6287s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6178s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1728s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1766s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1680s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.60E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3352s - [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s + [COUNTERS] Fortran MEs ( 1 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.30E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0722s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM 
TOTAL : 0.4131s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0701s for 90112 events => throughput is 1.29E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.217666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.227734e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242066e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s + [COUNTERS] PROGRAM TOTAL : 0.1813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1774s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3926s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 90112 events => throughput is 2.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 90112 events => throughput is 2.05E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002470e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.006601e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3800s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.603611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.620678e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718712e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.819190e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1833s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1784s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3723s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3407s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0316s for 90112 events => throughput is 2.85E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.713996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.820321e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.775269e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.842053e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1855s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1819s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1785s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.38E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 90112 events => throughput is 2.23E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.190424e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075096e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.183626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166357e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5997s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7863s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.173877e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.141020e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.893710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.873271e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716630e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.387595e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.361218e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.739579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939860e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.929113e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.944408e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693635e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.975323e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118370e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.124184e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index ff3c2ae8d4..4b3b0b9b07 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:30 +DATE: 2023-11-08_22:08:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6418s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6338s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6276s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6195s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1778s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1697s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 
[XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4264s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3383s - [COUNTERS] Fortran MEs ( 1 ) : 0.0882s for 90112 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4139s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s + [COUNTERS] Fortran MEs ( 1 ) : 0.0849s for 90112 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1909s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 90112 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0687s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.260485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.261327e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.287607e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1786s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3464s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0278s for 90112 events => throughput is 3.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3651s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.182676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137840e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.343050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298087e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3767s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 90112 events => throughput is 3.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3609s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 90112 events => throughput is 3.47E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.496883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.442542e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.660390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.634986e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1856s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 90112 events => throughput is 3.68E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.562187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.588607e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872180e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3693s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.223682e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372399e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.583359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.586770e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.73E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5951s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.69E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7616s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.96E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.583398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.577355e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881767e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.822297e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997979e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.937359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043514e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046785e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.954785e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.102347e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.219791e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.203659e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.299152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.365649e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.462264e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.422918e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 7741c53b46..9a947a36a5 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,8 +1,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:47 +DATE: 2023-11-08_22:08:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6300s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6186s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1737s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1781s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4238s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s - [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 
1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1898s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 90112 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4158s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 90112 events => throughput is 1.25E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.177056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204267e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.187091e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.208788e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0462s for 90112 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000126e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.047978e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.127276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.116427e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1855s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 90112 events => throughput is 2.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3733s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642107e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.645393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787956e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1900s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1776s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3484s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0325s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3708s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3395s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 90112 events => throughput is 2.88E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.740865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.821887e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.846547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.874115e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1860s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0381s for 90112 events => throughput is 2.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 90112 events => throughput is 2.39E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.237740e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.406082e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366800e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5935s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.82E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.181977e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.007927e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.926668e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918411e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.726329e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018629e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.399920e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.348012e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994146e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.877527e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.917104e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983673e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118945e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123333e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 1c30dae812..3e628018af 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,8 +17,6 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' @@ -30,10 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:04 +DATE: 2023-11-08_22:09:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3264s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3086s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2681s + [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2402s - [COUNTERS] Fortran MEs ( 1 ) : 0.4586s for 90112 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6533s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2091s + [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3139s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0379s for 8192 events => throughput is 2.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7148s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2977s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4171s for 90112 events => throughput is 2.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6716s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4071s for 90112 events => throughput is 2.21E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.224417e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212367e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3012s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2919s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2439s for 90112 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4780s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2359s for 90112 events => throughput is 3.82E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.777989e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740213e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,8 +286,8 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] PROGRAM TOTAL : 0.2955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4266s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1496s for 90112 events => throughput is 6.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.870487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.030466e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.072305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.192047e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2946s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2826s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4026s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2691s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1335s for 90112 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3659s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2362s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1297s for 90112 events => throughput is 6.95E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.610205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.841360e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.622254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.816529e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5054s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2228s for 90112 events => throughput is 4.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2517s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2106s for 90112 events => throughput is 4.28E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.955720e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.045481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094472e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7037s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7032s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6871s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 90112 events => throughput is 1.37E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7032s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6968s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.103744e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.671088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.691695e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.005777e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074802e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070229e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168601e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.147636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149757e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.014036e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.190999e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011683e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.017633e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7edcebceb9..0321a276a0 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:30 +DATE: 2023-11-08_22:09:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3245s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3489s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2811s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2670s + [COUNTERS] Fortran MEs ( 1 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - 
[COUNTERS] PROGRAM TOTAL : 1.7464s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2748s - [COUNTERS] Fortran MEs ( 1 ) : 0.4716s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6502s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2060s + [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3467s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3115s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0352s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3425s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6806s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3898s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6631s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3861s for 90112 events => throughput is 2.33E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.279168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342613e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299428e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319125e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3002s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.66E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4417s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2764s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1652s for 90112 events => throughput is 5.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2359s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1614s for 90112 events => throughput is 5.58E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.270911e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.323283e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.359921e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2834s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3699s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0894s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3197s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 90112 events => throughput is 1.07E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.026437e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028771e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3439s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2627s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 90112 events => throughput is 1.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3159s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.090791e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095999e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092579e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120004e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2963s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2859s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0104s for 8192 events => throughput is 7.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2909s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3867s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1153s for 90112 events => throughput is 7.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4173s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2979s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1195s for 90112 events => throughput is 7.54E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.366599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.668644e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548978e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.52E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6937s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6918s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 90112 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6513s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243778e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266713e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234896e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.837802e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.830084e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.769339e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762403e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.775138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776301e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872477e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.397746e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.374142e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.449606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.426544e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 30dac17633..8bacc65fe8 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,13 +17,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:55 +DATE: 2023-11-08_22:09:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3194s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3103s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s + [COUNTERS] Fortran MEs ( 1 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6795s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2275s - [COUNTERS] Fortran MEs ( 1 ) : 0.4521s for 90112 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6907s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s + [COUNTERS] Fortran MEs ( 1 ) : 0.4542s for 90112 events => throughput is 1.98E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- 
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 8192 events => throughput is 2.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3074s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7156s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4188s for 90112 events => throughput is 2.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6798s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2683s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4115s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.164831e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.146418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.183670e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0208s for 8192 events => throughput is 3.95E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5211s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4761s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2295s for 90112 events => throughput is 3.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.687467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799865e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.756525e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4192s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1446s for 90112 events => throughput is 6.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1464s for 90112 events => throughput is 6.15E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.051901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.181937e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.195854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.243573e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2995s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2875s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3959s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3670s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1262s for 90112 events => throughput is 7.14E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.842430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933959e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.007264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.064349e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3146s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2960s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3083s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 8192 events => throughput is 4.35E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5378s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4519s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2484s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2036s for 90112 events => throughput is 4.43E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.894022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.266660e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.117226e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7061s + [COUNTERS] PROGRAM TOTAL : 0.6949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6943s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6539s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6476s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.049753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.049281e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.613651e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.529307e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019403e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148817e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060699e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053163e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.995962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170472e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.130394e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.026315e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186789e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022885e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.035076e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d992721ecf..09e16e6057 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -2,12 +2,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
-
-make USEBUILDDIR=1 AVX=512y
make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=512y
make USEBUILDDIR=1 AVX=512z
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
@@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:54:21
+DATE: 2023-11-08_22:10:24
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5463s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2264s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3199s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5436s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2280s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3156s for 8192 events => throughput is 2.60E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5423s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3201s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5326s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2186s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3141s for 8192 events => throughput is 2.61E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9241s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4090s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5151s for 90112 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.9133s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4066s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.5067s for 90112 events => throughput is 2.57E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8783s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5509s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3274s for 8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8544s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5319s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3225s for 8192 events => throughput is 2.54E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3304s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7125s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6180s for 90112 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.3255s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.6247s for 90112 events => throughput is 2.49E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.563855e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.590377e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.539633e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.610150e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5609s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3903s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1705s for 8192 events => throughput is 4.80E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1763s for 8192 events => throughput is 4.65E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4794s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5811s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8984s for 90112 events => throughput is 4.75E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.3972s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5470s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8502s for 90112 events => throughput is 4.87E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.820475e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.010592e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.874297e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.958333e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3928s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3073s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0854s for 8192 events => throughput is 9.59E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3818s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.79E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.4294s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4857s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9437s for 90112 events => throughput is 9.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3684s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4497s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9187s for 90112 events => throughput is 9.81E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.717012e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.953639e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.756457e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002866e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0744s for 8192 events => throughput is 1.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8597s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2634s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4412s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8222s for 90112 events => throughput is 1.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117525e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126876e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1073s for 8192 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3231s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1039s for 8192 events => throughput is 7.89E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6869s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5079s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1790s for 90112 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6060s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4689s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1371s for 90112 events => throughput is 7.92E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.730653e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896705e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.578143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.740238e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6745s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6527s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8560s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.611230e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624902e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.333105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.902263e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.644038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.850642e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240451e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653799e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.868590e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.251657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248755e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.651458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.862444e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.754830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745100e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index a339973536..1a98ebc0f5 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
+
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
-
make USEBUILDDIR=1 AVX=avx2
make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:55:03
+DATE: 2023-11-08_22:11:05
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5498s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2259s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3239s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5362s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5475s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2238s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3236s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5340s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3154s for 8192 events => throughput is 2.60E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9843s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4284s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5559s for 90112 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8590s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3887s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4703s for 90112 events => throughput is 2.60E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8606s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5403s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3204s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8380s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3125s for 8192 events => throughput is 2.62E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.2449s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7120s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.5329s for 90112 events => throughput is 2.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1166s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6696s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.4470s for 90112 events => throughput is 2.61E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.612374e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.677117e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.564881e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.693750e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4339s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3327s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1012s for 8192 events => throughput is 8.10E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4030s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0934s for 8192 events => throughput is 8.77E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5445s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4937s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0508s for 90112 events => throughput is 8.58E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4739s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0282s for 90112 events => throughput is 8.76E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.676181e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.839523e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.776153e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.853955e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3149s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2698s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3025s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2591s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9260s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4419s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4841s for 90112 events => throughput is 1.86E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8724s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4009s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4715s for 90112 events => throughput is 1.91E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.919418e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922480e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2562s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.14E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4395s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4373s for 90112 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8215s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4279s for 90112 events => throughput is 2.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114883e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.107711e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0523s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2710s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0508s for 8192 events => throughput is 1.61E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0319s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4573s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5745s for 90112 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9668s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4146s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5522s for 90112 events => throughput is 1.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.561181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619298e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625264e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6502s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.56E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8485s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.7852s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7757s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292780e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275339e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.862148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852966e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637111e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672301e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.443658e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.329588e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.661199e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.515346e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474053e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.504423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511679e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.620516e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616407e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 0d971ecde6..b41396f75b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-
make USEBUILDDIR=1 AVX=none
+
make USEBUILDDIR=1 AVX=sse4
make USEBUILDDIR=1 AVX=avx2
make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:55:40
+DATE: 2023-11-08_22:11:42
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5559s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2317s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3242s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5361s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2201s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5470s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2235s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3235s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5352s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9714s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4219s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5496s for 90112 events => throughput is 2.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8603s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3884s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4719s for 90112 events => throughput is 2.60E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8877s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5532s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3345s for 8192 events => throughput is 2.45E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8721s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3301s for 8192 events => throughput is 2.48E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.5218s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7614s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.7604s for 90112 events => throughput is 2.40E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2894s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6845s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.6049s for 90112 events => throughput is 2.50E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.427313e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.562016e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.496439e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.546299e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5567s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1675s for 8192 events => throughput is 4.89E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5435s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1640s for 8192 events => throughput is 4.99E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4587s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5767s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8820s for 90112 events => throughput is 4.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.3591s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5386s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8206s for 90112 events => throughput is 4.95E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.968795e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.765208e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.959892e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.784106e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3947s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3085s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0862s for 8192 events => throughput is 9.51E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4043s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.4507s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9513s for 90112 events => throughput is 9.47E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3898s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4601s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9297s for 90112 events => throughput is 9.69E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685236e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002689e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.962312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001815e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2949s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4713s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8237s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2429s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4365s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8064s for 90112 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.124486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134514e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.126890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146843e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1117s for 8192 events => throughput is 7.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3234s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1077s for 8192 events => throughput is 7.60E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7230s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5103s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2127s for 90112 events => throughput is 7.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6602s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4811s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1791s for 90112 events => throughput is 7.64E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.441126e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.628154e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.419166e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.726777e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6540s + [COUNTERS] PROGRAM TOTAL : 0.6526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8526s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8298s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8190s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7961s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.93E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626262e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619555e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.404025e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.847979e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.234131e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.606969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.825056e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.246896e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.626608e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.833245e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728520e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724277e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index ba8c60f62e..e6041006eb 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:56:23 +DATE: 2023-11-08_22:12:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2815s - [COUNTERS] Fortran MEs ( 1 ) : 4.1753s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3823s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s + [COUNTERS] Fortran MEs ( 1 ) : 4.1043s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2787s - [COUNTERS] Fortran MEs ( 1 ) : 4.2387s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] Fortran MEs ( 1 ) : 4.0853s for 8192 events => throughput is 2.01E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0120s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9235s - [COUNTERS] Fortran MEs ( 1 ) : 46.0885s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.0624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8747s + [COUNTERS] Fortran MEs ( 1 ) : 45.1877s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.7799s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4663s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3136s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM 
TOTAL : 8.6032s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3774s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2258s for 8192 events => throughput is 1.94E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.8857s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1301s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.7557s for 90112 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.5656s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9752s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.5903s for 90112 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002618e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.950653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000666e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.8228s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5191s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3037s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6983s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4517s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2466s for 8192 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.7001s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1956s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5045s for 90112 events => throughput is 3.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.0395s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1529s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8866s for 90112 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.686347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775162e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.681541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752647e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2608s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2531s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0077s for 8192 events => throughput is 8.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2090s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2291s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9799s for 8192 events => throughput is 8.36E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8799s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8850s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9950s for 90112 events => throughput is 8.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6019s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8333s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7686s for 90112 events => throughput is 8.37E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.425637e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.622945e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.448586e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.637406e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0082s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1311s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8771s for 8192 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9647s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.4208s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7744s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6464s for 90112 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2465s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7171s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5294s for 90112 events => throughput is 9.46E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.625406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.867536e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.599473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.834174e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3764s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1005s for 8192 events => throughput is 7.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4062s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3349s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0712s for 8192 events => throughput is 7.65E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3944s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0207s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3737s for 90112 events => throughput is 7.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9424s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7703s for 90112 events => throughput is 7.66E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.671946e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.501573e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.485706e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8150s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7821s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7752s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7813s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4228s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7243s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3497s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.281506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290435e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.519229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518069e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109074e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.149081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162766e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.098811e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119359e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.169654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.170946e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.104970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114486e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.438070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433160e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 2c58d8399d..a18920ba3f 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:00:40 +DATE: 2023-11-08_22:16:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4730s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2806s - [COUNTERS] Fortran MEs ( 1 ) : 4.1924s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4492s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2726s + [COUNTERS] Fortran MEs ( 1 ) : 4.1766s for 8192 events => throughput is 1.96E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s - [COUNTERS] Fortran MEs ( 1 ) : 4.2110s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s + [COUNTERS] Fortran MEs ( 1 ) : 4.0903s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0870s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9193s - [COUNTERS] 
Fortran MEs ( 1 ) : 46.1676s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.0727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8744s + [COUNTERS] Fortran MEs ( 1 ) : 45.1984s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.5167s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3246s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1920s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2240s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1462s for 8192 events => throughput is 1.98E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.0969s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9741s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.1228s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.9666s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8905s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.0761s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.036738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075529e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.035901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074082e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5366s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3964s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1402s for 8192 events => throughput is 7.18E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5244s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1534s for 8192 events => throughput is 7.10E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.6834s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0490s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6344s for 90112 events => throughput is 7.13E+03 events/s + 
[COUNTERS] PROGRAM TOTAL : 15.2999s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0272s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2727s for 90112 events => throughput is 7.34E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.385848e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.487987e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.336063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461964e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5013s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4990s for 8192 events => throughput is 1.64E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.9611s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4052s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5558s for 90112 events => throughput is 1.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8843s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3862s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4981s for 90112 events => throughput is 1.64E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.671775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.703770e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.674155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.715659e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1460s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4413s for 8192 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4306s for 8192 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.1917s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3395s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.8523s for 90112 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.0325s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2899s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7425s for 90112 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.946675e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909696e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957212e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3662s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5456s for 8192 events => throughput is 1.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7944s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5278s for 8192 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.4389s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4516s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.9874s for 90112 events => throughput is 1.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.1973s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8043s for 90112 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.534307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558982e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.484518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568288e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7549s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7491s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6207s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3864s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2342s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5805s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.582914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598757e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.939400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937809e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.495923e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.662803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725491e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.489429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.498449e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.660457e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.463590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.473649e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522099e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7032d72896..05db57554d 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,16 +16,16 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:04:02 +DATE: 2023-11-08_22:19:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] Fortran MEs ( 1 ) : 4.1852s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3676s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s + [COUNTERS] Fortran MEs ( 1 ) : 4.0901s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2777s - [COUNTERS] Fortran MEs ( 1 ) : 4.1649s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s + [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.3675s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9183s - 
[COUNTERS] Fortran MEs ( 1 ) : 46.4493s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1152s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8703s + [COUNTERS] Fortran MEs ( 1 ) : 45.2450s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.0356s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6432s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3924s for 8192 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7049s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4327s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2722s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.3841s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2075s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.1766s for 90112 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.0960s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0891s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0069s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.971437e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924168e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965809e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7893s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5036s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2857s for 8192 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7042s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4800s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2242s for 8192 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.4680s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1631s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.3048s for 90112 events => throughput is 3.56E+03 events/s 
+ [COUNTERS] PROGRAM TOTAL : 28.5105s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0554s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4551s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.703810e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800834e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.713606e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788503e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9882s for 8192 events => throughput is 8.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1858s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2226s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9633s for 8192 events => throughput is 8.50E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.9261s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8936s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.0324s for 90112 events => throughput is 8.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.5514s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8252s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7262s for 90112 events => throughput is 8.40E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.503062e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.756273e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.519397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.759413e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1323s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8805s for 8192 events => throughput is 9.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9510s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8530s for 8192 events => throughput is 9.60E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.3871s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7750s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6121s for 90112 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1748s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7107s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4641s for 90112 events => throughput is 9.52E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.683983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.859146e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.679001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.890303e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5013s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1134s for 8192 events => throughput is 7.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4412s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3447s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0965s for 8192 events => throughput is 7.47E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3721s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0357s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3363s for 90112 events => throughput is 7.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9437s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8266s for 90112 events => throughput is 7.62E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.423059e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.668015e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.425324e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694387e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8048s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7728s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7756s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3626s for 90112 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7246s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3499s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.294705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280245e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.524485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.525176e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.113307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116522e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157499e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119956e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.183136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172287e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.103258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.122850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.440669e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 568f545851..b972c40fa5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:47 +DATE: 2023-11-08_22:25:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.8408s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s + [COUNTERS] Fortran MEs ( 1 ) : 95.3863s for 8192 events => throughput is 8.59E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s + [COUNTERS] Fortran MEs ( 1 ) : 95.0528s for 8192 events => throughput is 8.62E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1050.5151s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1583s + [COUNTERS] Fortran MEs ( 1 ) : 1046.3568s for 90112 events => throughput is 8.61E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 216.0448s + [COUNTERS] Fortran Overhead ( 0 ) : 99.5423s + [COUNTERS] CudaCpp MEs ( 2 ) : 116.5025s for 8192 events => throughput is 7.03E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1395.0826s + [COUNTERS] Fortran Overhead ( 0 ) : 101.4573s + [COUNTERS] CudaCpp MEs ( 2 ) : 1293.6254s for 90112 events => throughput is 6.97E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007) differ by less than 2E-14 (1.1102230246251565e-15) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.294341e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.275454e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
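
Note on the 'OK! xsec ... differ by less than 2E-14' lines above: the value in the final parentheses is the relative difference between the Fortran and CudaCpp cross sections, which must stay below the 2E-14 tolerance used for these double-precision ('_d_') builds. A minimal Python sketch of that arithmetic (xsec_rel_diff is a hypothetical helper for illustration; the actual check lives in the tmad test scripts):

# Sketch of the xsec cross-check arithmetic (illustration only).
def xsec_rel_diff(xsec_fortran, xsec_cpp):
    # Relative difference |cpp/fortran - 1| between the two cross sections.
    return abs(xsec_cpp / xsec_fortran - 1.0)

# Values copied from the log above: the double-precision builds agree to a
# few 1E-15, i.e. close to machine epsilon and well below the 2E-14 tolerance.
assert xsec_rel_diff(1.1693100945435802E-006, 1.1693100945435831E-006) < 2E-14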
+-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 107.3938s + [COUNTERS] Fortran Overhead ( 0 ) : 49.4703s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.9235s for 8192 events => throughput is 1.41E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (2.220446049250313e-15) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 689.6088s + [COUNTERS] Fortran Overhead ( 0 ) : 53.6676s + [COUNTERS] CudaCpp MEs ( 2 ) : 635.9412s for 90112 events => throughput is 1.42E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.663387e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 50.5726s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0971s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.4754s for 8192 events => throughput is 2.98E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 326.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 26.6301s + [COUNTERS] CudaCpp MEs ( 2 ) : 300.3396s for 90112 events => throughput is 3.00E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.612820e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.630261e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 44.4764s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3120s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.1644s for 8192 events => throughput is 3.39E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 289.1902s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9124s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.2778s for 90112 events => throughput is 3.40E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.088132e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.127446e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.6965s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1825s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5139s for 8192 events => throughput is 3.48E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 283.5251s + [COUNTERS] Fortran Overhead ( 0 ) : 25.9112s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.6139s for 90112 events => throughput is 3.50E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.741805e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.777930e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 4.1875s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1069s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0806s for 8192 events => throughput is 7.58E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435838E-006) differ by less than 2E-14 (3.1086244689504383e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 18.7118s + [COUNTERS] Fortran Overhead ( 0 ) : 6.8168s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8950s for 90112 events => throughput is 7.58E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.523661e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.283120e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.266218e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.591927e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.251570e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.476794e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.262349e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.252080e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e844ee5b79..3ca211fa85 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:50 +DATE: 2023-11-08_23:51:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.6648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s + [COUNTERS] Fortran MEs ( 1 ) : 95.2103s for 8192 events => throughput is 8.60E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
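
The throughput quoted on the [COUNTERS] lines above is simply the number of events divided by the time spent in the matrix-element calls; a one-liner to reproduce it (an illustration of the arithmetic, assuming the counters derive it exactly this way):

# Throughput arithmetic from the '[COUNTERS] Fortran MEs' line above.
nevents, me_seconds = 8192, 95.2103   # 8192 events in 95.2103s of Fortran MEs
throughput = nevents / me_seconds     # ~8.60E+01 events/s, as printed
print(f"throughput is {throughput:.2E} events/s")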
+-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.3879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4521s + [COUNTERS] Fortran MEs ( 1 ) : 94.9358s for 8192 events => throughput is 8.63E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1051.3512s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1998s + [COUNTERS] Fortran MEs ( 1 ) : 1047.1514s for 90112 events => throughput is 8.61E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 195.2840s + [COUNTERS] Fortran Overhead ( 0 ) : 89.6572s + [COUNTERS] CudaCpp MEs ( 2 ) : 105.6269s for 8192 events => throughput is 7.76E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694768344939596E-006) differ by less than 4E-4 (0.00014259686216466783) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1253.2021s + [COUNTERS] Fortran Overhead ( 0 ) : 93.4786s + [COUNTERS] CudaCpp MEs ( 2 ) : 1159.7235s for 90112 events => throughput is 7.77E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436150871156E-007) differ by less than 4E-4 (0.00014045934987350073) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.188520e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.207566e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
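
For the single-precision ('_f_') builds in this log the same relative-difference check is applied with a much looser 4E-4 tolerance, since float MEs only agree with the double-precision Fortran reference at the 1E-4 level (all the '_f_' comparisons here come out around 1.4E-4). Reusing the xsec_rel_diff sketch from above:

# Same relative-difference check, with the looser single-precision tolerance.
# Values copied from the '_f_' log above: the difference is ~1.4E-4 < 4E-4.
assert xsec_rel_diff(1.1693100945435802E-006, 1.1694768344939596E-006) < 4E-4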
+-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.9590s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2330s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7260s for 8192 events => throughput is 3.18E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694765850750953E-006) differ by less than 4E-4 (0.00014238355787066226) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 312.4727s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8498s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.6229s for 90112 events => throughput is 3.15E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430669586527E-007) differ by less than 4E-4 (0.00014020271663550687) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.595667e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615224e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 25.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8022s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.6023s for 8192 events => throughput is 6.02E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 164.6743s + [COUNTERS] Fortran Overhead ( 0 ) : 15.5764s + [COUNTERS] CudaCpp MEs ( 2 ) : 149.0979s for 90112 events => throughput is 6.04E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.233727e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.144603e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.4388s + [COUNTERS] Fortran Overhead ( 0 ) : 10.5095s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9293s for 8192 events => throughput is 6.87E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
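
In the CHECK/GCHECK lines of these logs, the '-p <blocks> <threads> <iterations>' arguments fix the GPU/SIMD grid, and the number of events per iteration is blocks*threads (an assumption read off the CHECK(8192) and GCHECK(MAX) labels, e.g. 256*32 = 8192). A quick Python check:

# Events per iteration for the '-p blocks threads iterations' grids above:
# CHECK(8192) uses 256*32; the three GCHECK(MAX*) variants all give 16384.
for blocks, threads in [(256, 32), (512, 32), (128, 128), (2048, 8)]:
    print(f"-p {blocks} {threads} 1 -> {blocks * threads} events")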
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 145.9227s + [COUNTERS] Fortran Overhead ( 0 ) : 13.9719s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.9508s for 90112 events => throughput is 6.83E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.277686e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.316223e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.8899s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3435s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5464s for 8192 events => throughput is 7.09E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694767957195604E-006) differ by less than 4E-4 (0.00014256370209930758) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 142.7065s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9424s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.7641s for 90112 events => throughput is 7.05E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435956349820E-007) differ by less than 4E-4 (0.00014045024240250115) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.537880e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.497574e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 2.4801s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9879s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694770708195000E-006) differ by less than 4E-4 (0.00014279896898083955) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 11.0377s + [COUNTERS] Fortran Overhead ( 0 ) : 5.5836s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4541s for 90112 events => throughput is 1.65E+04 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007) differ by less than 4E-4 (0.0001408023850304474) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.639292e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.626171e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.329585e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.369301e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.304460e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.376586e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333260e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.421151e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 43bf5072f2..2729351c42 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:53 +DATE: 2023-11-09_00:57:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.3917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4564s + [COUNTERS] Fortran MEs ( 1 ) : 94.9352s for 8192 events => throughput is 8.63E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.2404s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4495s + [COUNTERS] Fortran MEs ( 1 ) : 94.7909s for 8192 events => throughput is 8.64E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1049.6483s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1482s + [COUNTERS] Fortran MEs ( 1 ) : 1045.5001s for 90112 events => throughput is 8.62E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
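The throughput figures in the [COUNTERS] lines are simple events-per-second arithmetic. A one-line sketch for the Fortran x10 run just shown:

# Throughput as reported by the [COUNTERS] lines (illustrative arithmetic)
n_events   = 90112      # events processed in the x10 run, per the log above
me_seconds = 1045.5001  # 'Fortran MEs ( 1 )' time from the same log
print(n_events / me_seconds)  # ~86.2, i.e. the reported 8.62E+01 events/s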
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 223.2377s + [COUNTERS] Fortran Overhead ( 0 ) : 102.8564s + [COUNTERS] CudaCpp MEs ( 2 ) : 120.3813s for 8192 events => throughput is 6.81E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101016896846E-006) differ by less than 2E-4 (6.111385175699979e-09) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1425.5713s + [COUNTERS] Fortran Overhead ( 0 ) : 106.5194s + [COUNTERS] CudaCpp MEs ( 2 ) : 1319.0519s for 90112 events => throughput is 6.83E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.033155e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.028364e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 110.1179s + [COUNTERS] Fortran Overhead ( 0 ) : 50.7873s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.3305s for 8192 events => throughput is 1.38E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658807442115e-09) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 704.2691s + [COUNTERS] Fortran Overhead ( 0 ) : 54.2949s + [COUNTERS] CudaCpp MEs ( 2 ) : 649.9742s for 90112 events => throughput is 1.39E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007) differ by less than 2E-4 (5.866422903011426e-09) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.635297e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628042e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.2204s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9374s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2831s for 8192 events => throughput is 3.12E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
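Comparing the none/sse4/avx2/512y/512z throughputs in these logs by eye is tedious; a small hypothetical helper (the function name and log path are illustrative, not part of the repository) could scrape the two kinds of throughput lines like this:

import re

# Hypothetical scraper for the '=> throughput is X events/s' [COUNTERS] lines
# and the 'EvtsPerSec[MECalcOnly] (3a) = ( X ) sec^-1' CHECK/GCHECK lines.
pat_counters = re.compile(r'throughput is ([0-9.]+E[+-][0-9]+) events/s')
pat_check    = re.compile(r'EvtsPerSec\[MECalcOnly\] \(3a\) = \( (\S+) \) sec\^-1')

def throughputs(path):
    text = open(path).read()
    return ([float(x) for x in pat_counters.findall(text)],
            [float(x) for x in pat_check.findall(text)])

counters, checks = throughputs('log_ggttggg_mad_m_inl0_hrd0.txt')
print(counters, checks)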
+-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 314.3646s + [COUNTERS] Fortran Overhead ( 0 ) : 26.1162s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.2484s for 90112 events => throughput is 3.13E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.810528e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.825565e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 42.6054s + [COUNTERS] Fortran Overhead ( 0 ) : 19.4149s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1905s for 8192 events => throughput is 3.53E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 278.0352s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0285s + [COUNTERS] CudaCpp MEs ( 2 ) : 255.0067s for 90112 events => throughput is 3.53E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.372569e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.390556e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 44.8365s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9299s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9066s for 8192 events => throughput is 3.58E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 280.1799s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4637s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.7162s for 90112 events => throughput is 3.54E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.829822e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.840554e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
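In the CHECK/GCHECK command lines that follow, the three numbers after -p appear to be blocks, threads per block and iterations, as the GCHECK(8192) label matching 256 x 32 suggests; blocks times threads gives the events per iteration:

# Grid sizes implied by the -p arguments in this ggttggg log (plain arithmetic)
print(256 * 32)   # 8192  -> CHECK/GCHECK(8192)
print(512 * 32)   # 16384 -> GCHECK(MAX)
print(128 * 128)  # 16384 -> GCHECK(MAX128THR)
print(2048 * 8)   # 16384 -> GCHECK(MAX8THR)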
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 3.5385s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8624s for 8192 events => throughput is 9.50E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.2792201459509442e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 15.7972s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4751s for 90112 events => throughput is 9.51E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173705990875078e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.416746e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.082101e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111361e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.159067e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.107992e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.110248e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.116277e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.631653e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 2a2ae334de..a53e3fae12 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'make[1]: Nothing to be done for 'all'. + make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:20 +DATE: 2023-11-08_22:24:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2380s - [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3033s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s + [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3042s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2333s - [COUNTERS] Fortran MEs ( 1 ) : 0.0708s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2323s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2114s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4363s - [COUNTERS] Fortran MEs ( 1 ) : 0.7751s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1700s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4093s + [COUNTERS] Fortran MEs ( 1 ) : 0.7607s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3158s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3087s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 
+167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8420s for 90112 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3194s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8220s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.094809e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.102064e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4571s for 90112 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9124s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4682s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4441s for 90112 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.984398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.028339e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.942353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046734e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2839s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2792s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.51E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7384s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4779s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2605s for 90112 events => throughput is 3.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6989s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4448s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2540s for 90112 events => throughput is 3.55E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.360936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552356e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.508331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523608e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7099s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2336s for 90112 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6795s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2296s for 90112 events => throughput is 3.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.842884e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.986906e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3488s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8124s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3444s for 90112 events => throughput is 2.62E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.489296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.637628e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.512748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.616200e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6555s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.13E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8543s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.578046e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567103e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.918680e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093360e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.385541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536245e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.515910e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.495821e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366310e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.517486e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.781318e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749421e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383694e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528020e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.778819e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.773747e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 76ba714558..8d2e1984e4 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:49 +DATE: 2023-11-08_22:24:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3082s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2374s - [COUNTERS] Fortran MEs ( 1 ) : 0.0709s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3107s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2376s - [COUNTERS] Fortran MEs ( 1 ) : 0.0713s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2276s + [COUNTERS] Fortran MEs ( 1 ) : 0.0689s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4416s - [COUNTERS] Fortran MEs ( 1 ) : 0.7760s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4006s + [COUNTERS] Fortran MEs ( 1 ) : 0.7577s for 
90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0730s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3282s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7945s for 90112 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7649s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157942e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172513e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2630s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 8192 events => throughput is 3.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7627s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2851s for 90112 events => throughput is 3.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ 
-258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.058447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237957e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.117460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.272249e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2617s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6264s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1403s for 90112 events => throughput is 6.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5730s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.530818e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.304004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.313362e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2672s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2557s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6213s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1315s for 90112 events => throughput is 6.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5576s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.360354e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.852775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.523552e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2707s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6597s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4813s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1784s for 90112 events => throughput is 5.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6820s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.682841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733153e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.814031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.992885e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6663s + [COUNTERS] PROGRAM TOTAL : 0.6547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6542s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9031s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 90112 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810157e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584146e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.442986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.491850e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.776377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.856033e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.714442e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.715106e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.784654e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.884678e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.791545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799322e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.353442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.441795e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.984091e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.896004e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d9f19e3972..19ad35f402 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:17 +DATE: 2023-11-08_22:25:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2365s - [COUNTERS] Fortran MEs ( 1 ) : 0.0711s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3005s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s - [COUNTERS] Fortran MEs ( 1 ) : 0.0707s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3006s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2306s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4390s - [COUNTERS] Fortran MEs ( 1 ) : 0.7783s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1678s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4091s + [COUNTERS] Fortran MEs ( 1 ) : 0.7587s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 0.3817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4080s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8525s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8245s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076052e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.029864e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093800e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5041s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4488s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8975s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4298s for 90112 events => throughput is 2.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.013366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015345e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.020889e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.988560e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2872s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2819s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7508s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4909s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2599s for 90112 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7744s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2658s for 90112 events => throughput is 3.39E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.485253e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.471740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534003e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2862s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7107s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4801s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2306s for 90112 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6779s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2245s for 90112 events => throughput is 4.01E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.890792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.974625e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.973788e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.057698e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0339s for 8192 events => throughput is 2.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8573s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3600s for 90112 events => throughput is 2.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8085s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3513s for 90112 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.438047e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.330681e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533534e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.16E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8718s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8641s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.584492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553454e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.972938e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.988956e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.377134e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533250e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.496287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.514727e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.388325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.523754e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.763560e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800142e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.530148e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.773123e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776434e+07 ) sec^-1 TEST COMPLETED From 7ae4e0460964de0b5e1951c9e8338d75f4ede8c3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:21:17 +0100 Subject: [PATCH 07/14] [actions/gpucpp] reenable testsuite check in the CI This completed my first version of the gpucpp PR, but I later also included the 3.5.2 upgrade Revert "[actions/gpucpp] TEMPORARILY disable testsuite on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded')" This reverts commit 1fd1c4c5f493c21c3b271f980571db21c604bc7c.
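[Editor's note: a minimal sketch, not part of this patch set, of one possible mitigation for the rate-limit error mentioned above, assuming the failing step invoked the gh CLI directly. 'HTTP 403: API rate limit exceeded' typically indicates unauthenticated GitHub API calls, and gh honours a GH_TOKEN environment variable, so passing the workflow token should raise the limit. The step name and placement below are illustrative only.]

    # Hypothetical workflow step (not from this patch): authenticate gh via
    # GH_TOKEN so the extension install does not hit the unauthenticated
    # API rate limit.
    - name: Install gh-actions-cache extension
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: gh extension install actions/gh-actions-cache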
--- .github/workflows/testsuite_allprocesses.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 662284f944..7eaad09c9f 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -15,9 +15,8 @@ on: workflow_dispatch: # Trigger the all-processes workflow for pull requests to master - # TEMPORARILY disable these tests on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') - ###pull_request: - ### branches: [ master ] + pull_request: + branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed push: From 3ee024de3f3181d3ceae1d6231c35ad08c2a4f24 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 16:56:55 +0100 Subject: [PATCH 08/14] [gpucpp] include Olivier's latest mg5amcnlo changes (merged from 3.5.2) --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index d7a466dd54..d8c1613ccf 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit d7a466dd54bb2f57564f5cc674f129ebf095c969 +Subproject commit d8c1613ccf638b5b078a64379e385def5649622c From 05dd4b2f25f960d3c60f83801b5d1b3c65fce95f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:51:05 +0100 Subject: [PATCH 09/14] [gpucpp] in CODEGEN launch_plugin.py and output.py, improve python formatting (cosmetics only) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 4 ++-- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e3f88719f2..fb33465a03 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -203,9 +203,9 @@ def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]): # AV (default from OM's tutorial) - add a debug printout def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): """Typically creating jpeg/HTML output/ compilation/... - cmdhistory is the list of command used so far. - MG5options are all the options of the main interface - outputflags is a list of options provided when doing the output command""" + cmdhistory is the list of command used so far. 
+ MG5options are all the options of the main interface + outputflags is a list of options provided when doing the output command""" misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) if self.in_madevent_mode: self.add_input_for_banner() @@ -217,7 +217,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): #if os.system(path + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0: # logger.debug("####### \n stdout is \n %s", stdout) # logger.info("####### \n stderr is \n %s", stderr) - # raise Exception('ERROR! the O/S call to patchMad.sh failed') + # raise Exception('ERROR! the O/S call to patchMad.sh failed') # OLD implementation (SH PR #762) #if os.system(PLUGINDIR + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0: # logger.debug("####### \n stdout is \n %s", stdout) @@ -270,7 +270,7 @@ def add_madevent_plugin_fct(self): which contains a series of functions and one dictionary variable TO_OVERWRITE that will be used to have temporary overwrite of all the key variable passed as string by their value. all variable that are file related should be called as madgraph.dir.file.variable - """ + """ plugin_path = os.path.dirname(os.path.realpath( __file__ )) files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) @@ -286,10 +286,10 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16') return args - + #------------------------------------------------------------------------------------ -class GPU_ProcessExporter(PLUGIN_ProcessExporter): +class GPU_ProcessExporter(PLUGIN_ProcessExporter): def change_output_args(args, cmd): """ """ cmd._export_format = "madevent" @@ -298,7 +298,7 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16384') return args - + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): misc.sprint("enter dedicated function") out = super().finalize(matrix_element, cmdhistory, MG5options, outputflag) From 641754e2d3a5f2a4386d7740d44420dad6c2d6f8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:14:51 +0100 Subject: [PATCH 10/14] [gpucpp] in CODEGEN __init__.py, mark version 3.5.2 as validated minimum version --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py index a0cd9dbfb3..82661c6c66 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py @@ -55,6 +55,6 @@ __author__ = 'Andrea Valassi' __email__ = 'andrea.valassi@cern.ch' __version__ = (1,0,0) - minimal_mg5amcnlo_version = (3,5,1) + minimal_mg5amcnlo_version = (3,5,2) maximal_mg5amcnlo_version = (1000,1000,1000) - latest_validated_version = (3,5,1) + latest_validated_version = (3,5,2) From aae8ef15559e794283328c341b33e4703afc7642 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:16:38 +0100 Subject: [PATCH 11/14] [gpucpp] in CODEGEN output.py, remove run_card_class again (Olivier has made this unnecessary in 3.5.2) Revert "[gpucpp] in CODEGEN output.py, add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card"" This reverts commit 
8c654cf0d35c332e3f4449301f8a8758cc3efce5. --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index fb33465a03..5b557e832a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,9 +149,6 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! - # AV 08 Nov 2023 add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card" - run_card_class = None - # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 From 56308e937172b6d66f0bbcf1f9ecb581794cebeb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:22:56 +0100 Subject: [PATCH 12/14] [gpucpp] regenerate all 15 processes after Olivier's latest upstream changes, merging to 3.5.2 Most changes are in the version comments (from 3.5.1 to 3.5.2). There are also some minor changes in genps.f, but they look like bug fixes (nincoming instead of a hardcoded 2). --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 31 ++++---- .../ee_mumu.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt | 2 +- .../ee_mumu.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig.f | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig1.f | 4 +- .../SubProcesses/P1_epem_mupmum/matrix1.f | 4 +- .../cudacpp/ee_mumu.mad/SubProcesses/genps.f | 4 +- .../ee_mumu.mad/bin/internal/__init__.py | 1 + .../ee_mumu.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../ee_mumu.mad/bin/internal/gen_ximprove.py | 18 +++-- .../ee_mumu.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/ee_mumu.mad/bin/internal/misc.py | 2 +- .../ee_mumu.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 30 ++++---- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 2 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 2 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 2 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 35 ++++----- .../cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt.mad/MGMEVersion.txt | 2 +- .../gg_tt.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttx/matrix1.f | 4 +- epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f | 4 +- .../gg_tt.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_tt.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_tt.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_tt.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++---
epochX/cudacpp/gg_tt.mad/bin/internal/misc.py | 2 +- .../gg_tt.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 2 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 28 +++---- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 2 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 2 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 39 +++++----- .../gg_tt01g.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt | 2 +- .../gg_tt01g.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttx/matrix1.f | 4 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 2 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 2 +- .../SubProcesses/P2_gg_ttxg/auto_dsig.f | 2 +- .../SubProcesses/P2_gg_ttxg/auto_dsig1.f | 4 +- .../SubProcesses/P2_gg_ttxg/matrix1.f | 4 +- .../cudacpp/gg_tt01g.mad/SubProcesses/genps.f | 4 +- .../gg_tt01g.mad/bin/internal/__init__.py | 1 + .../gg_tt01g.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_tt01g.mad/bin/internal/gen_ximprove.py | 18 +++-- .../bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_tt01g.mad/bin/internal/misc.py | 2 +- .../gg_tt01g.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 37 +++++----- .../gg_ttg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt | 2 +- .../gg_ttg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxg/matrix1.f | 4 +- .../cudacpp/gg_ttg.mad/SubProcesses/genps.f | 4 +- .../gg_ttg.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_ttg.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttg.mad/bin/internal/misc.py | 2 +- .../gg_ttg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 28 +++---- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 2 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 37 +++++----- .../gg_ttgg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt | 2 +- .../gg_ttgg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig.f | 
2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxgg/matrix1.f | 4 +- .../cudacpp/gg_ttgg.mad/SubProcesses/genps.f | 4 +- .../gg_ttgg.mad/bin/internal/__init__.py | 1 + .../gg_ttgg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttgg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_ttgg.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttgg.mad/bin/internal/misc.py | 2 +- .../gg_ttgg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 32 ++++---- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 2 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 37 +++++----- .../gg_ttggg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt | 2 +- .../gg_ttggg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxggg/matrix1.f | 4 +- .../cudacpp/gg_ttggg.mad/SubProcesses/genps.f | 4 +- .../gg_ttggg.mad/bin/internal/__init__.py | 1 + .../gg_ttggg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttggg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttggg.mad/bin/internal/misc.py | 2 +- .../gg_ttggg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 ++++---- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 2 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 35 ++++----- .../gq_ttq.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt | 2 +- .../gq_ttq.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 2 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 2 +- .../SubProcesses/P1_gu_ttxu/auto_dsig.f | 2 +- .../SubProcesses/P1_gu_ttxu/auto_dsig1.f | 4 +- .../SubProcesses/P1_gu_ttxu/matrix1.f | 4 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 2 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 2 +- .../SubProcesses/P1_gux_ttxux/auto_dsig.f | 2 +- .../SubProcesses/P1_gux_ttxux/auto_dsig1.f | 4 +- .../SubProcesses/P1_gux_ttxux/matrix1.f | 4 +- .../cudacpp/gq_ttq.mad/SubProcesses/genps.f | 4 +- .../gq_ttq.mad/bin/internal/__init__.py | 1 + .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gq_ttq.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gq_ttq.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gq_ttq.mad/bin/internal/misc.py | 2 +- .../gq_ttq.mad/bin/internal/shower_card.py | 10 ++- 
 epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 2 +-
 .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 2 +-
 .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 40 +++++-----
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 2 +-
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 2 +-
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 2 +-
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 2 +-
 .../CODEGEN_cudacpp_heft_gg_h_log.txt | 22 +++---
 .../P1_Sigma_heft_gg_h/CPPProcess.cc | 2 +-
 .../P1_Sigma_heft_gg_h/CPPProcess.h | 2 +-
 .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 2 +-
 .../heft_gg_h.sa/src/Parameters_heft.cc | 2 +-
 .../heft_gg_h.sa/src/Parameters_heft.h | 2 +-
 .../CODEGEN_mad_pp_tt012j_log.txt | 73 ++++++++++---------
 .../pp_tt012j.mad/Cards/proc_card_mg5.dat | 2 +-
 epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt | 2 +-
 .../pp_tt012j.mad/SubProcesses/MGVersion.txt | 2 +-
 .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 2 +-
 .../SubProcesses/P0_gg_ttx/CPPProcess.h | 2 +-
 .../SubProcesses/P0_gg_ttx/auto_dsig.f | 2 +-
 .../SubProcesses/P0_gg_ttx/auto_dsig1.f | 4 +-
 .../SubProcesses/P0_gg_ttx/matrix1.f | 4 +-
 .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 2 +-
 .../SubProcesses/P0_uux_ttx/CPPProcess.h | 2 +-
 .../SubProcesses/P0_uux_ttx/auto_dsig.f | 2 +-
 .../SubProcesses/P0_uux_ttx/auto_dsig1.f | 4 +-
 .../SubProcesses/P0_uux_ttx/matrix1.f | 4 +-
 .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gg_ttxg/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gg_ttxg/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gg_ttxg/matrix1.f | 4 +-
 .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gu_ttxu/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gu_ttxu/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gu_ttxu/matrix1.f | 4 +-
 .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gux_ttxux/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gux_ttxux/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gux_ttxux/matrix1.f | 4 +-
 .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_uux_ttxg/CPPProcess.h | 2 +-
 .../SubProcesses/P1_uux_ttxg/auto_dsig.f | 2 +-
 .../SubProcesses/P1_uux_ttxg/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_uux_ttxg/matrix1.f | 4 +-
 .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gg_ttxgg/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gg_ttxgg/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gg_ttxgg/matrix1.f | 4 +-
 .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gg_ttxuux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gg_ttxuux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gg_ttxuux/matrix1.f | 4 +-
 .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gu_ttxgu/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gu_ttxgu/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gu_ttxgu/matrix1.f | 4 +-
 .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gux_ttxgux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gux_ttxgux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gux_ttxgux/matrix1.f | 4 +-
 .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uc_ttxuc/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uc_ttxuc/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uc_ttxuc/matrix1.f | 4 +-
 .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_ucx_ttxucx/matrix1.f | 4 +-
 .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uu_ttxuu/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uu_ttxuu/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uu_ttxuu/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxccx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxccx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxccx/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxgg/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxgg/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxgg/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxuux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxuux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxuux/matrix1.f | 4 +-
 .../P2_uxcx_ttxuxcx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/matrix1.f | 4 +-
 .../P2_uxux_ttxuxux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uxux_ttxuxux/matrix1.f | 4 +-
 .../pp_tt012j.mad/SubProcesses/genps.f | 4 +-
 .../pp_tt012j.mad/bin/internal/__init__.py | 1 +
 .../pp_tt012j.mad/bin/internal/banner.py | 5 +-
 .../bin/internal/common_run_interface.py | 17 ++++-
 .../bin/internal/gen_ximprove.py | 18 +++--
 .../bin/internal/launch_plugin.py | 4 +-
 .../bin/internal/madevent_interface.py | 25 ++++---
 .../pp_tt012j.mad/bin/internal/misc.py | 2 +-
 .../pp_tt012j.mad/bin/internal/shower_card.py | 10 ++-
 epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h | 2 +-
 .../pp_tt012j.mad/src/Parameters_sm.cc | 2 +-
 .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 2 +-
 307 files changed, 1017 insertions(+), 753 deletions(-)
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index d5d0a77b77..e6546f684c 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005647420883178711
+DEBUG: model prefixing takes 0.005372047424316406
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -161,10 +161,10 @@ Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
 INFO: initialize a new directory: CODEGEN_mad_ee_mumu
 INFO: remove old information in CODEGEN_mad_ee_mumu
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards
@@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1
 INFO: Creating files in directory P1_epem_mupmum
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.100 s
+Wrote files for 8 helas calls in 0.098 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
-ALOHA: aloha creates 3 routines in 0.203 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+ALOHA: aloha creates 3 routines in 0.200 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 7 routines in 0.267 s
+ALOHA: aloha creates 7 routines in 0.255 s
 FFV1
 FFV1
 FFV2
@@ -226,12 +226,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
@@ -241,16 +242,16 @@ patching file matrix1.f
 Hunk #3 succeeded at 230 (offset 9 lines).
 Hunk #4 succeeded at 267 (offset 18 lines).
 Hunk #5 succeeded at 312 (offset 18 lines).
-DEBUG: p.returncode =  0 [output.py at line 235]
+DEBUG: p.returncode =  0 [output.py at line 232]
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README
 Run "open index.html" to see more information about this process.
 quit
 
-real 0m2.189s
+real 0m4.853s
 user 0m1.653s
-sys 0m0.232s
+sys 0m0.201s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -263,7 +264,7 @@ sys 0m0.232s
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -297,7 +298,7 @@ launch in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
index b9e01f684b..618adbca06 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 0af629d3a8..fc293da1de 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
index f2ef5c1e14..77b610753c 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
index f78f7c102e..02520466e6 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
 DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
index fcf2e4dec5..4188745070 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
@@ -1,7 +1,7 @@
 DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
index 21e300b33e..1991a72bb9 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -319,7 +319,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 
 REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-           if (d1.gt.0.and.d1.le.2) then
+           if (d1.gt.0.and.d1.le.nincoming) then
              ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
            else
              ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
            endif
-           if (d2.gt.0.and.d2.le.2) then
+           if (d2.gt.0.and.d2.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
            else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
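
The genps.f hunk above deserves a note: it replaces a hardcoded `2` by `nincoming` when deciding whether a daughter leg is initial-state, i.e. whether its momentum must be subtracted rather than added when building a propagator momentum. A minimal Python sketch of that sign rule, with hypothetical names that only mirror the Fortran variables (this is not the MadGraph code itself):

    # Combine daughter momenta into a propagator momentum; incoming legs
    # enter with a minus sign. Comparing against a literal 2 was only
    # correct for 2->N scattering: for a 1->N decay (nincoming=1) it
    # wrongly subtracted the momentum of outgoing leg 2.
    def propagator_momentum(p, daughters, nincoming):
        total = [0.0, 0.0, 0.0, 0.0]
        for d in daughters:
            sign = -1.0 if 1 <= d <= nincoming else +1.0  # was: d <= 2
            for j in range(4):
                total[j] += sign * p[d][j]
        return total

With `nincoming = 1` (a decay), leg 2 is now correctly treated as outgoing, which is presumably why this hunk accompanies the decay-kinematics fixes in this patch series.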
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
         if self.param_consistency:
             try:
-                self.do_update('dependent', timer=20)
+                if not self.update_dependent_done:
+                    self.do_update('dependent', timer=20)
+                self.update_dependent_done = False
             except MadGraph5Error as error:
                 if 'Missing block:' in str(error):
                     self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
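
The common_run_interface.py hunks above implement a simple run-once guard: `do_update('dependent')` triggers a slow, alarm-guarded model load, and `postcmd` used to repeat it after every answer. A simplified sketch of the pattern under the same names (a sketch of the control flow, not the full interface class):

    class CardEditor:
        def __init__(self):
            self.update_dependent_done = False

        def update_dependent(self, timer):
            pass  # placeholder for the real slow param/run card update

        def do_update(self, what, timer=0):
            if what == 'dependent':
                self.update_dependent(timer)       # slow step
                self.update_dependent_done = True  # remember it already ran

        def postcmd(self):
            # after each command: run the slow update at most once...
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
            self.update_dependent_done = False     # ...but re-arm for the next edit

Note also that `signal.alarm(timer)` is now armed only immediately before the model load instead of at function entry, so the 20-second budget is no longer consumed by the PDF bookkeeping that precedes it.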
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
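
The gen_ximprove.py hunk above replaces a broad try/except around the channel count with an explicit empty-output branch: a P* directory that reports no channels is recorded as a zero-result process, its matrix*orig.f files are copied back as matrix*optim.f so later compilation still finds them, and a stale error file is removed. A standalone Python sketch of that fallback, rewritten as a hypothetical helper (the real code lives inside get_helicity and uses misc.glob/files.cp):

    import glob, os, shutil

    def channels_or_skip(stdout, pdir, me_dir, p_zero_result):
        if stdout:
            return max(int(float(d)) for d in stdout.split())
        # no channel reported: keep the build consistent and skip this process
        for orig in glob.glob(os.path.join(pdir, 'matrix*orig.f')):
            shutil.copy(orig, orig.replace('orig', 'optim'))
        p_zero_result.append(pdir)
        err = os.path.join(me_dir, 'error')
        if os.path.exists(err):
            os.remove(err)  # stale marker left by the failed helicity run
        return None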
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-        
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
         """Not in help: Combine a given iteration
         combine_iteration Pdir Gdir S|R step
         S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
         level = int(options.logging)
     else:
         level = eval('logging.' + options.logging)
-    logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf'))
+    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
+    logging.config.fileConfig(log_path)
     logging.root.setLevel(level)
     logging.getLogger('madgraph').setLevel(level)
 except:
@@ -7433,9 +7438,9 @@ def error(self, msg=''):
     if '--web' in args:
         i = args.index('--web')
         args.pop(i)
-        cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmd(me_dir, force_run=True)
     else:
-        cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmdShell(me_dir, force_run=True)
     if not hasattr(cmd_line, 'do_%s' % args[0]):
         if parser_error:
             print(parser_error)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
index d3fed3baa2..91cd3e5c22 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
@@ -347,7 +347,7 @@ def tell(msg):
     if dependency=='ninja':
         if cmd.options['ninja'] in ['None',None,''] or\
            (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\
-           which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None):
+           which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None):
             tell("Installing ninja...")
             cmd.do_install('ninja')
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
index c6d3948cc4..c344ea1b15 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
@@ -45,7 +45,9 @@ class ShowerCard(dict):
     false = ['.false.', 'f', 'false', '0']
     logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable',
                     'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable',
-                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td']
+                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td',
+                    'space_shower_me_corrections', 'time_shower_me_corrections',
+                    'time_shower_me_extended', 'time_shower_me_after_first']
     string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse']
     for i in range(1,100):
         string_vars.append('dm_'+str(i))
@@ -82,7 +84,11 @@ class ShowerCard(dict):
     'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'},
     'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'},
     'qcut' : {'PYTHIA8':'qcut'},
-    'njmax' : {'PYTHIA8':'njmax'}}
+    'njmax' : {'PYTHIA8':'njmax'},
+    'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'},
+    'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'},
+    'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'},
+    'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}}
     stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'}
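
The shower_card.py hunk above wires four new PYTHIA8-only switches through the card machinery. Each variable needs two entries: one in logical_vars, so the parser accepts true/false-style strings, and one in the per-shower name map, so it is only written out for showers that understand it. A simplified Python sketch of that double lookup (excerpted and renamed for illustration, not the ShowerCard class itself):

    logical_vars = ['space_shower_me_corrections']                    # excerpt
    name_map = {'space_shower_me_corrections':
                    {'PYTHIA8': 'space_shower_me_corrections'}}      # excerpt

    def write_var(shower, var, value):
        if shower not in name_map.get(var, {}):
            return None  # variable has no meaning for this shower: skip it
        if var in logical_vars:
            value = '.true.' if value else '.false.'
        return '%s = %s' % (name_map[var][shower], value)

A variable listed in only one of the two tables would either fail to parse or never be emitted, which is why the hunk touches both.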
diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
index 19819e2451..9fa30cfd7f 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
index 31f620c44e..0b4be4d5ed 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
+++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
index 521831ce4a..64d0b8e761 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index ccb39ba2cc..8cb80f0d38 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005671501159667969
+DEBUG: model prefixing takes 0.005633831024169922
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -160,28 +160,28 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]
-DEBUG: type(subproc_group)= [output.py at line 193]
-DEBUG: type(fortran_model)= [output.py at line 194]
-DEBUG: type(me)= me=0 [output.py at line 195]
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]
+DEBUG: type(subproc_group)= [output.py at line 190]
+DEBUG: type(fortran_model)= [output.py at line 191]
+DEBUG: type(me)= me=0 [output.py at line 192]
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/.
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 4 routines in 0.272 s
+ALOHA: aloha creates 4 routines in 0.267 s
 FFV1
 FFV1
 FFV2
@@ -198,9 +198,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]
 quit
 
-real 0m0.795s
-user 0m0.698s
-sys 0m0.066s
+real 0m3.653s
+user 0m0.601s
+sys 0m0.049s
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
index c0ab4edb92..684bd53bf5 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
index f2ef5c1e14..77b610753c 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
index 19819e2451..9fa30cfd7f 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
index 31f620c44e..0b4be4d5ed 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
index 521831ce4a..64d0b8e761 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index b0eb76c9f4..a1fa47508f 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005650758743286133
+DEBUG: model prefixing takes 0.005694150924682617
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1
 INFO: Process has 3 diagrams
-1 processes with 3 diagrams generated in 0.009 s
+1 processes with 3 diagrams generated in 0.008 s
 Total: 1 processes with 3 diagrams
 output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
 INFO: initialize a new directory: CODEGEN_mad_gg_tt
 INFO: remove old information in CODEGEN_mad_gg_tt
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1
 INFO: Creating files in directory P1_gg_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttx
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.107 s
+Wrote files for 10 helas calls in 0.101 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in 0.152 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+ALOHA: aloha creates 2 routines in 0.145 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 4 routines in 0.140 s
+ALOHA: aloha creates 4 routines in 0.132 s
 VVV1
 FFV1
 FFV1
@@ -219,27 +219,28 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-DEBUG: p.returncode =  0 [output.py at line 235]
+DEBUG: p.returncode =  0 [output.py at line 232]
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README
 Run "open index.html" to see more information about this process.
 quit
 
-real 0m1.780s
-user 0m1.544s
-sys 0m0.218s
+real 0m4.772s
+user 0m1.470s
+sys 0m0.223s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -252,7 +253,7 @@ sys 0m0.218s
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -285,7 +286,7 @@ launch in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
index 2a2fd25453..4c14989a3f 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
index 02f655f48c..d2e7a3c91d 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
index 0c2d2b0687..3ebd92c038 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
index fe184caddf..d80d770784 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
 DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index 5a3da931f2..9346ee4c6a 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
 DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
index daea73a6df..0c2ce6ec40 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 
 REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-           if (d1.gt.0.and.d1.le.2) then
+           if (d1.gt.0.and.d1.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
           endif
-           if (d2.gt.0.and.d2.le.2) then
+           if (d2.gt.0.and.d2.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
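
The banner.py hunk above (the same change already seen in the ee_mumu copy) does two things: it pins `SDE_strategy` to 1 whenever the process has a single initial-state particle, i.e. a decay, and it relaxes the loop on-shell threshold `OSThres` from 1.0e-13 to 1.0e-8. A minimal sketch of the defaulting rule, assuming a dict-like run card as in the diff (the rationale for preferring strategy 1 in decays is not stated in the patch):

    def apply_decay_defaults(run_card, proc_characteristic):
        # one incoming leg means a decay process: force SDE strategy 1
        if proc_characteristic['ninitial'] == 1:
            run_card['SDE_strategy'] = 1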
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
         if self.param_consistency:
             try:
-                self.do_update('dependent', timer=20)
+                if not self.update_dependent_done:
+                    self.do_update('dependent', timer=20)
+                self.update_dependent_done = False
             except MadGraph5Error as error:
                 if 'Missing block:' in str(error):
                     self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-        
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
        """Not in help: Combine a given iteration
        combine_iteration Pdir Gdir S|R step
        S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 27709b8f4f..805df19bd9 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005872249603271484  +DEBUG: model prefixing takes 0.00567626953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
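The madevent_interface.py hunk above stops deriving the process directory from the script's realpath and instead walks three dirname steps up from the internal package itself — per the comment in the hunk, to avoid PYTHONPATH resolving the wrong file. A minimal sketch of that derivation, assuming it runs inside a generated process directory where bin/internal is importable as internal:

    # internal.__file__ is <me_dir>/bin/internal/__init__.py, so three
    # dirname() calls walk back up to the process directory <me_dir>.
    import os
    import logging.config

    import internal

    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
    logging.config.fileConfig(log_path)  # config now resolved relative to me_dir
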
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.143 s VVV1 FFV1 FFV1 @@ -193,9 +193,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
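The shower_card.py hunks above register the four new PYTHIA8 matrix-element-correction switches twice: once in logical_vars, which fixes how the value is parsed, and once in the per-shower translation table, which decides which shower ever sees the option. A minimal sketch of that two-table pattern; my_new_flag is a hypothetical name, not one of the patched options:

    # An option must appear in BOTH tables to round-trip: logical_vars
    # types it as a boolean, name_map translates it per shower program.
    logical_vars = ['ue_enabled', 'hadronize', 'my_new_flag']
    name_map = {'my_new_flag': {'PYTHIA8': 'my_new_flag'}}

    def translate(option, value, shower):
        """Return (target_name, value) for this shower, or None to drop it."""
        targets = name_map.get(option, {})
        if shower in targets:
            return targets[shower], value
        return None

    print(translate('my_new_flag', True, 'PYTHIA8'))  # ('my_new_flag', True)
    print(translate('my_new_flag', True, 'HERWIG6'))  # None: HERWIG6 lacks it
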
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.565s -user 0m0.498s -sys 0m0.040s +real 0m3.529s +user 0m0.478s +sys 0m0.048s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 141d1f24ac..0e44ef42c3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0eefbc9b91..9d4dbd85f0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677461624145508  +DEBUG: model prefixing takes 0.005400419235229492  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,17 +163,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,23 +217,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
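The do_combine_events hunk in the madevent_interface.py diffs above (one copy per process directory) consolidates its progress reporting: a start timestamp at entry and a single elapsed-time message once combination and any bias correction are done. The pattern, reduced to a sketch with the actual work elided:

    import logging
    import time

    logger = logging.getLogger('madevent')

    def do_combine_events():
        start = time.time()
        # ... concatenate per-channel event files, apply bias corrections ...
        logger.info("combination of events done in %s s ", time.time() - start)
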
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.249 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.242 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.324 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -257,12 +257,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -276,16 +277,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. 
quit -real 0m2.345s -user 0m2.078s -sys 0m0.243s +real 0m5.282s +user 0m2.049s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -298,7 +299,7 @@ sys 0m0.243s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -331,7 +332,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index cdb64729b1..d0845f65f5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 02f655f48c..d2e7a3c91d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index fe184caddf..d80d770784 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index 5a3da931f2..9346ee4c6a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index daea73a6df..0c2ce6ec40 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index ce1badffca..1e24c2819d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 248ed1ec9e..3901ddcb20 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index f751e9f14a..53ca75eaf4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index 6eb0fa0827..d6c6f42c9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 02f406668c..5c91f2448c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ 
b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to 
work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ 
-7419,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
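The banner.py hunks above (repeated in every regenerated process directory) add one process-dependent default and relax one MadLoop threshold: processes with a single initial-state particle force SDE_strategy to 1, and the OSThres default moves from 1.0e-13 to 1.0e-8. A sketch of the defaulting step with an illustrative proc_characteristic dict; the motivation is my assumption, presumably the alternative strategy does not apply to 1 -> N decay kinematics:

    proc_characteristic = {'ninitial': 1}   # illustrative 1 -> N process
    run_card = {'SDE_strategy': 2}

    if proc_characteristic['ninitial'] == 1:
        run_card['SDE_strategy'] = 1        # forced for single-particle initial states
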
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 740186af78..68afa8d9b0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005747556686401367  +DEBUG: model prefixing takes 0.005378007888793945  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
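The common_run_interface.py hunks above introduce a one-shot flag so the expensive dependent-parameter update runs at most once per answer cycle: do_update('dependent') records completion, postcmd and check_answer_consistency consult the flag before triggering it again, and postcmd re-arms it for the next command. A minimal sketch with the class reduced to the three methods involved:

    class CardEditor:
        def __init__(self):
            self.update_dependent_done = False

        def do_update(self, what, timer=0):
            if what == 'dependent':
                # ... recompute alpha_s and other dependent parameters (slow) ...
                self.update_dependent_done = True

        def postcmd(self):
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
            self.update_dependent_done = False  # re-arm for the next command

        def check_answer_consistency(self):
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
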
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,23 +190,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
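The same common_run_interface.py hunks also move the signal.alarm(timer) call from the top of update_dependent down to just before the model load, so the timeout budget covers only the step it was meant to bound. A sketch of the rearranged SIGALRM pattern (Unix-only; load_model stands in for the slow mecmd.get_model() call, and the final disarm is illustrative rather than taken from the hunk):

    import signal
    import time

    class TimeOutError(Exception):
        pass

    def handle_alarm(signum, frame):
        raise TimeOutError

    signal.signal(signal.SIGALRM, handle_alarm)

    def load_model():
        time.sleep(1)                     # stand-in for the slow model load

    def update_dependent(timer=20):
        log_level = 30 if timer else 20   # cheap setup: alarm not armed yet
        # ... read the run card, choose alpha_s for the selected pdf ...
        if timer:
            signal.alarm(timer)           # armed just before the slow step
        try:
            load_model()
        except TimeOutError:
            pass                          # timed out: leave the cards as-is
        finally:
            signal.alarm(0)               # illustrative: disarm when done
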
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.152 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s +Wrote files for 36 helas calls in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.332 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.323 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.326 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -230,12 +230,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -245,16 +246,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
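The genps.f hunks in this patch replace the hard-coded d1.le.2 / d2.le.2 tests in get_channel_cut with d1.le.nincoming, so the channel-cut momentum flow no longer silently assumes a 2 -> N process. Rendered as a Python sketch of the per-daughter sign rule (the Fortran operates on a ptemp array indexed by leg number; here ptemp is a mapping from leg index to a 4-momentum list, with the surrounding loop elided):

    def accumulate_daughter(ptemp, node, daughter, nincoming):
        # legs 1..nincoming carry incoming momentum and are subtracted;
        # any other daughter (outgoing, or an internal negative index) adds
        sign = -1 if 0 < daughter <= nincoming else +1
        for j in range(4):
            ptemp[node][j] += sign * ptemp[daughter][j]
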
quit -real 0m2.221s -user 0m1.964s -sys 0m0.245s +real 0m5.147s +user 0m1.924s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -267,7 +268,7 @@ sys 0m0.245s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -300,7 +301,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3af4991f01..a0ffbbc219 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index f7f5899260..5e2bf0d19a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index d528b1d2f0..dd4cd3a0c2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 110e204c24..e28575ead8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index bf665ff6e0..a885b7fde3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 
@@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): 
level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index f795e1428d..97056958fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0055065155029296875  +DEBUG: model prefixing takes 0.005817890167236328  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
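[Editor's note on the madevent_interface.py hunks earlier in this patch: they stop computing the process directory from the script's own realpath and instead walk up from the location of the internal package, as the new in-diff comment explains: internal.__file__ lives at <me_dir>/bin/internal/__init__.py, so three dirname calls recover <me_dir>. A minimal sketch of that path arithmetic in Python, with a made-up path standing in for the real internal.__file__:

    import os

    pkg_file = '/data/example/PROC_gg_ttg/bin/internal/__init__.py'  # i.e. internal.__file__
    internal_dir = os.path.dirname(pkg_file)   # .../PROC_gg_ttg/bin/internal
    bin_dir = os.path.dirname(internal_dir)    # .../PROC_gg_ttg/bin
    me_dir = os.path.dirname(bin_dir)          # .../PROC_gg_ttg
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
    print(me_dir, log_path)

This is also why the logging setup switches from root_path to log_path: resolving through the imported package avoids picking up a wrong directory via PYTHONPATH, which the old realpath-based root_path could do.]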
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.326 s VVV1 VVV1 FFV1 @@ -201,9 +201,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
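[Editor's note: two related patterns recur in the common_run_interface.py hunks of this patch. First, signal.alarm(timer) is no longer armed at the top of update_dependent but only just before the expensive model load, so the whole timeout budget covers the slow step; second, a new update_dependent_done flag lets postcmd and check_answer_consistency skip a repeat of an update that already ran. A standalone sketch of the combined idea, Unix-only and deliberately simplified (load_model is a placeholder, and the flag here lives on the module rather than on a card-editing object as in the real code):

    import signal

    class TimeOutError(Exception):
        pass

    def handle_alarm(signum, frame):
        raise TimeOutError

    signal.signal(signal.SIGALRM, handle_alarm)

    def load_model():
        return 'model'                 # stand-in for the genuinely slow step

    update_dependent_done = False

    def update_dependent(timer=0):
        global update_dependent_done
        if update_dependent_done:
            return None                # memoized: a second call is a no-op
        # ... cheap bookkeeping goes here, outside the timeout window ...
        if timer:
            signal.alarm(timer)        # arm the alarm only for the slow step
        try:
            model = load_model()
        except TimeOutError:
            model = None               # timed out: give up gracefully
        finally:
            signal.alarm(0)            # always disarm
        update_dependent_done = True
        return model
]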
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.870s -user 0m0.728s -sys 0m0.055s +real 0m3.779s +user 0m0.713s +sys 0m0.062s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 9393033e26..7f5e51681d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 374e4defbb..eacd7a356a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005505084991455078  +DEBUG: model prefixing takes 0.0053293704986572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,23 +190,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s -Wrote files for 222 helas calls in 0.704 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s +Wrote files for 222 helas calls in 0.691 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.335 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.333 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.316 s VVV1 VVV1 FFV1 @@ -233,12 +233,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -248,16 +249,16 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
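[Editor's note: one upstream change worth spelling out is the gen_ximprove.py rewrite of get_helicity, repeated for every regenerated process in this patch. Instead of a try/except that logged and skipped a P* directory when the helicity survey printed nothing, the code now branches explicitly: it restores the pristine matrix*orig.f files over their _optim.f counterparts, records the directory as a zero-result, and clears any stale error file before moving on. A compact sketch of that fallback shape (the p_zero_result list and the glob pattern mirror the diff; the rest is simplified):

    import glob, math, os, shutil

    def channels_or_fallback(pdir, stdout, p_zero_result):
        if stdout:
            return max(math.floor(float(d)) for d in stdout.split())
        # empty survey: put the original matrix files back in place ...
        for orig in glob.glob(os.path.join(pdir, 'matrix*orig.f')):
            shutil.copy(orig, orig.replace('orig', 'optim'))
        p_zero_result.append(pdir)          # ... and remember the bad process
        err = os.path.join(pdir, 'error')
        if os.path.exists(err):
            os.remove(err)                  # drop the stale error marker
        return None                         # caller issues a `continue`

Restoring the _orig files matters because, as the in-diff comment notes, a bare continue would leave the matrixX_optim.f/matrixX_orig.f linkage inconsistent for the later stages.]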
quit -real 0m3.310s -user 0m3.061s -sys 0m0.239s +real 0m6.262s +user 0m3.028s +sys 0m0.232s ************************************************************ * * * W E L C O M E to * @@ -270,7 +271,7 @@ sys 0m0.239s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -303,7 +304,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index e4d3fe550f..b7568d1a73 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 896d64343e..57dd4aed47 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 9d747e6dc1..adf0afbe05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index 043887bde3..e4e527260c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index df931e07c4..272c6bd97d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ 
-6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, 
msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index b1a7fdc7e4..80631c94bf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005366086959838867  +DEBUG: model prefixing takes 0.00567317008972168  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.161 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.325 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
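[Editor's note: the genps.f hunks in this patch (identical for every regenerated process) replace the hard-coded test d1.le.2 with d1.le.nincoming in get_channel_cut, so the propagator-momentum bookkeeping no longer assumes two incoming legs and also works for decays with a single initial particle: daughters that are incoming legs are subtracted, all other daughters are added. The same sign rule in Python, as a sketch with a toy 2->2 configuration (leg indices follow the Fortran convention, negative for internal propagators):

    def add_daughter(ptemp, node, daughter, nincoming):
        # incoming legs 1..nincoming enter with a minus sign, as in the
        # patched get_channel_cut; every other daughter is added
        sign = -1 if 0 < daughter <= nincoming else 1
        for j in range(4):
            ptemp[node][j] += sign * ptemp[daughter][j]

    ptemp = {1: [10, 0, 0, 10], 3: [6, 1, 0, 5], -1: [0, 0, 0, 0]}
    add_daughter(ptemp, -1, 1, nincoming=2)   # subtract incoming leg 1
    add_daughter(ptemp, -1, 3, nincoming=2)   # add outgoing leg 3
    print(ptemp[-1])                          # [-4, 1, 0, -5]
]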
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m1.466s -user 0m1.388s -sys 0m0.064s +real 0m4.435s +user 0m1.373s +sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 927a19a802..204439a1dc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index af1d671efc..ab3974344c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596160888671875  +DEBUG: model prefixing takes 0.005319833755493164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.921 s +1 processes with 1240 diagrams generated in 1.855 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,23 +192,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
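The Color-Flow lines above record the key optimization for this large process: the 15120-term color sum for g g > t t~ g g g is rewritten into 1630 terms by introducing 3030 shared contractions. The real pass lives inside the MG5aMC exporter; purely as a loose illustration of the underlying idea (common-subexpression elimination over a sum of products, with invented names, NOT the actual color-flow code), a minimal sketch could look like:

    # Illustration only: contract repeated adjacent factor pairs in a sum of
    # products into shared intermediates, so each is evaluated once.
    from collections import Counter

    def contract_terms(terms):
        """terms: list of tuples of factor names, e.g. ('T1', 'T2', 'T3')."""
        pair_counts = Counter()
        for t in terms:
            for i in range(len(t) - 1):
                pair_counts[(t[i], t[i + 1])] += 1
        contractions = {}  # (factor, factor) -> intermediate name
        new_terms = []
        for t in terms:
            t = list(t)
            i = 0
            while i < len(t) - 1:
                pair = (t[i], t[i + 1])
                if pair_counts[pair] > 1:  # shared by several terms: contract
                    name = contractions.setdefault(pair, 'C%d' % len(contractions))
                    t[i:i + 2] = [name]  # replace the pair by the intermediate
                else:
                    i += 1
            new_terms.append(tuple(t))
        return new_terms, contractions

Each shared product is then computed once and reused across terms; the actual MG5aMC pass works on the color-flow basis and additionally merges terms, which is where the reduction in the term count itself comes from.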
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.769 s -Wrote files for 2281 helas calls in 18.847 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s +Wrote files for 2281 helas calls in 18.431 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.320 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.335 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.313 s VVV1 VVV1 FFV1 @@ -235,12 +235,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -250,16 +251,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
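One detail worth noting in the patching output above: patch.common now applies its gen_ximprove.py hunk at an offset ("Hunk #1 succeeded at 391 (offset 6 lines)"), because the upstream file gained six lines ahead of the patched region; the reworked get_helicity logic responsible for that appears in the gen_ximprove.py diff further down in this commit. Its survey fallback, reduced to a sketch (the misc and files arguments are stand-ins for the MG5aMC helper modules, and the return value replaces the in-loop continue of the real code):

    import math, os
    from os.path import join as pjoin

    def survey_channels(stdout, Pdir, me_dir, P_zero_result, misc, files):
        # Sketch of the regenerated get_helicity fallback in gen_ximprove.py:
        # an empty survey output no longer raises. The directory is recorded
        # as a zero result and bypassed, after copying matrix*_orig.f over
        # matrix*_optim.f so later compilation steps still find their input.
        if stdout:
            return max(math.floor(float(d)) for d in stdout.split())
        for matrix_file in misc.glob('matrix*orig.f', Pdir):
            files.cp(matrix_file, matrix_file.replace('orig', 'optim'))
        P_zero_result.append(Pdir)
        if os.path.exists(pjoin(me_dir, 'error')):
            os.remove(pjoin(me_dir, 'error'))  # clear a stale error marker
        return None  # caller skips this Pdir (the real code uses `continue`)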
quit -real 0m29.796s -user 0m29.282s -sys 0m0.413s +real 0m32.103s +user 0m28.586s +sys 0m0.412s ************************************************************ * * * W E L C O M E to * @@ -272,7 +273,7 @@ sys 0m0.413s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -305,7 +306,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 05d11d495d..2f92ecc4ba 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index a525c4ba3f..59033d7b2f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 2d3c5725be..d2a61fa2ac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 51b8d47520..f22dfbf5e6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index ac5285eda5..41dbc97183 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -413,7 +413,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': 
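The genps.f hunk above carries real behaviour: get_channel_cut decided whether a daughter momentum is subtracted (incoming leg) or added (outgoing leg) with a hard-coded d.le.2, which is only correct for 2 -> N topologies; it now tests d.le.nincoming, so 1 -> N configurations get the proper signs while 2 -> N behaviour is unchanged. This fits the banner.py change above that forces SDE_strategy = 1 for ninitial == 1 processes. A minimal Python transcription of the fixed loop body (ptemp and iforest modelled as dicts keyed like the Fortran arrays):

    def accumulate_propagator(ptemp, iforest, i, config, nincoming):
        # Daughters d1, d2 of internal propagator -i enter with a minus sign
        # when they are incoming legs (1 <= d <= nincoming, previously the
        # hard-coded d <= 2), and with a plus sign otherwise.
        d1 = iforest[(1, -i, config)]
        d2 = iforest[(2, -i, config)]
        for j in range(4):  # E, px, py, pz
            for d in (d1, d2):
                if 0 < d <= nincoming:
                    ptemp[(j, -i)] -= ptemp[(j, d)]
                else:
                    ptemp[(j, -i)] += ptemp[(j, d)]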
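The common_run_interface.py hunks introduce a simple done-flag: update_dependent_done is set once do_update('dependent', ...) has actually recomputed the dependent parameters, so the expensive 20-second-budgeted update is not repeated by both postcmd and check_answer_consistency within the same pass (postcmd re-arms the flag afterwards); the continuation of this diff below also moves signal.alarm(timer) so the SIGALRM budget covers only the model load rather than the preceding alpha_s/PDF bookkeeping. The guard pattern in isolation (a sketch, with a print standing in for the expensive update; not the real interface class):

    class CardChecker:
        def __init__(self):
            self.update_dependent_done = False

        def do_update_dependent(self):
            print('recomputing dependent parameters...')  # expensive in reality
            self.update_dependent_done = True

        def postcmd(self):
            if not self.update_dependent_done:
                self.do_update_dependent()
            self.update_dependent_done = False  # re-arm for the next command

        def check_answer_consistency(self):
            if not self.update_dependent_done:
                self.do_update_dependent()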
self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def 
error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
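The madevent_interface.py launcher changes above stop deriving the run directory from the script's realpath and instead locate it from the imported internal package itself, which stays correct when the bin/ scripts are symlinked or a wrong path wins on PYTHONPATH; the logging configuration is then loaded relative to the same me_dir. The essential lines, as a standalone sketch (only meaningful inside a generated process directory where internal is importable):

    import os
    import internal  # the process directory's bin/internal package

    # internal.__file__ is <me_dir>/bin/internal/__init__.py, so three
    # dirname() calls recover the process directory itself.
    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')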
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 73a2d9596c..33bae20142 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00565791130065918  +DEBUG: model prefixing takes 0.005532503128051758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,28 +155,28 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.891 s +1 processes with 1240 diagrams generated in 1.880 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.621 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m13.161s -user 0m12.961s -sys 0m0.105s +real 0m15.959s +user 0m12.810s +sys 0m0.102s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index a67b74e5b7..30acce4afc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 3fcb694ccd..89cb2749b0 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005532264709472656  +DEBUG: model prefixing takes 0.0057373046875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
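The "Crossed process found ..., reuse diagrams" lines above show why eight gq channels cost barely more than one: diagrams are generated once and reused for the flavour- and charge-crossed processes. As a loose illustration of that caching idea (not MG5aMC's actual data model), keyed on the crossing-invariant particle content:

    # Illustration only: cache diagram sets by the multiset of external
    # particles (all-outgoing convention), so crossed processes hit the cache.
    from collections import Counter

    _diagram_cache = {}

    def diagrams_for(all_outgoing_pdgs, generate):
        # all_outgoing_pdgs: PDG ids with incoming legs flipped to outgoing;
        # generate: callable performing the expensive diagram generation.
        key = frozenset(Counter(all_outgoing_pdgs).items())
        if key not in _diagram_cache:
            _diagram_cache[key] = generate(all_outgoing_pdgs)
        return _diagram_cache[key]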
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -231,16 +231,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.224 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 2 routines in 0.144 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.132 s FFV1 FFV1 FFV1 @@ -260,12 +260,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -287,15 +288,15 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. 
quit -real 0m1.962s -user 0m1.726s +real 0m4.915s +user 0m1.680s sys 0m0.237s ************************************************************ * * @@ -309,7 +310,7 @@ sys 0m0.237s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -342,7 +343,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index dc07af3836..efb0752a31 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index c526dd6b31..649c608210 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index cdc2dc91ac..bf037c6c28 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index e6d01dad0b..bd8e2f143a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 8d92e4e769..930da28159 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index a90abc4ab4..0f49f5247b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
index f2eba72de7..c9b8759b60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
index 5ec9701b78..62c235de64 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
index 7a2e329e64..4c05be74a0 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-            if (d1.gt.0.and.d1.le.2) then
+            if (d1.gt.0.and.d1.le.nincoming) then
                ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
             else
                ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
             endif
-            if (d2.gt.0.and.d2.le.2) then
+            if (d2.gt.0.and.d2.le.nincoming) then
                ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
             else
                ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
             if self.param_consistency:
                 try:
-                    self.do_update('dependent', timer=20)
+                    if not self.update_dependent_done:
+                        self.do_update('dependent', timer=20)
+                    self.update_dependent_done = False
                 except MadGraph5Error as error:
                     if 'Missing block:' in str(error):
                         self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
         """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step
            S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
             level = int(options.logging)
         else:
             level = eval('logging.' + options.logging)
-        logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf'))
+        log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
+        logging.config.fileConfig(log_path)
         logging.root.setLevel(level)
         logging.getLogger('madgraph').setLevel(level)
     except:
@@ -7433,9 +7438,9 @@ def error(self, msg=''):
     if '--web' in args:
         i = args.index('--web')
         args.pop(i)
-        cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmd(me_dir, force_run=True)
     else:
-        cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmdShell(me_dir, force_run=True)
     if not hasattr(cmd_line, 'do_%s' % args[0]):
         if parser_error:
             print(parser_error)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
index d3fed3baa2..91cd3e5c22 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
@@ -347,7 +347,7 @@ def tell(msg):
     if dependency=='ninja':
         if cmd.options['ninja'] in ['None',None,''] or\
            (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\
-           which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None):
+           which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None):
             tell("Installing ninja...")
             cmd.do_install('ninja')
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
index c6d3948cc4..c344ea1b15 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
@@ -45,7 +45,9 @@ class ShowerCard(dict):
     false = ['.false.', 'f', 'false', '0']
     logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable',
                     'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable',
-                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td']
+                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td',
+                    'space_shower_me_corrections', 'time_shower_me_corrections',
+                    'time_shower_me_extended', 'time_shower_me_after_first']
     string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse']
     for i in range(1,100):
         string_vars.append('dm_'+str(i))
@@ -82,7 +84,11 @@ class ShowerCard(dict):
         'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'},
         'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'},
         'qcut' : {'PYTHIA8':'qcut'},
-        'njmax' : {'PYTHIA8':'njmax'}}
+        'njmax' : {'PYTHIA8':'njmax'},
+        'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'},
+        'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'},
+        'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'},
+        'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}}
     stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'}
diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
index 0dd5f20f71..cd4e6de668 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
index d5eda63ee0..c06dcbb252 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
index 0c77cf58f0..a6eb185434 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 06d5354735..16374bd28e 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0056154727935791016 
+DEBUG: model prefixing takes 0.005791902542114258 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.080 s
+8 processes with 40 diagrams generated in 0.078 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1
@@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1
 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=0 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=0 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/.
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=1 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=1 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/.
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.146 s
+ALOHA: aloha creates 2 routines in 0.144 s
 FFV1
 FFV1
 FFV1
@@ -225,9 +225,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206] 
 quit
-real 0m0.655s
-user 0m0.595s
-sys 0m0.055s
+real 0m3.656s
+user 0m0.594s
+sys 0m0.059s
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
index 037662f7db..4965f393c5 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
index cdc2dc91ac..bf037c6c28 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
index 12179b9801..5024e8e239 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
index a90abc4ab4..0f49f5247b 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
index 0dd5f20f71..cd4e6de668 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
index d5eda63ee0..c06dcbb252 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
index 0c77cf58f0..a6eb185434 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index 645c0db954..3b04fc3fb3 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -135,22 +135,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=0 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=0 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/.
 Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines
 ALOHA: aloha creates 1 routines in 0.062 s
@@ -163,9 +163,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc
 INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206] 
 quit
-real 0m0.430s
+real 0m3.422s
 user 0m0.371s
-sys 0m0.055s
+sys 0m0.048s
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
index 6cc0be1461..1d59f8e3cf 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
index d0312182d5..dbc5aa0e4e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
index a2e9b6a70c..eae9ff5242 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
index fde65d5571..e5442756b1 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
index d1a451b2c3..790485fee0 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 1d0d9e2a35..8b6ca99446 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005470752716064453 
+DEBUG: model prefixing takes 0.00538325309753418 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.137 s
+13 processes with 76 diagrams generated in 0.134 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes.
@@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
-65 processes with 1119 diagrams generated in 1.856 s
+65 processes with 1119 diagrams generated in 1.811 s
 Total: 83 processes with 1202 diagrams
 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
 INFO: initialize a new directory: CODEGEN_mad_pp_tt012j
 INFO: remove old information in CODEGEN_mad_pp_tt012j
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards 
@@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2
 INFO: Creating files in directory P2_gg_ttxgg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
 INFO: Creating files in directory P2_gg_ttxuux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux
 INFO: Creating files in directory P2_gu_ttxgu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu
 INFO: Creating files in directory P2_gux_ttxgux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux
 INFO: Creating files in directory P2_uux_ttxgg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg
 INFO: Creating files in directory P1_gg_ttxg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg
 INFO: Creating files in directory P2_uu_ttxuu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu
 INFO: Creating files in directory P2_uux_ttxuux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux
 INFO: Creating files in directory P2_uxux_ttxuxux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux
 INFO: Creating files in directory P2_uc_ttxuc
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc
 INFO: Creating files in directory P2_uux_ttxccx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx
 INFO: Creating files in directory P2_ucx_ttxucx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx
 INFO: Creating files in directory P2_uxcx_ttxuxcx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx
 INFO: Creating files in directory P1_gu_ttxu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu
 INFO: Creating files in directory P1_gux_ttxux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux
 INFO: Creating files in directory P1_uux_ttxg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group uux_ttxg
 INFO: Creating files in directory P0_gg_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2
 INFO: Finding symmetric diagrams for subprocess group gg_ttx
 INFO: Creating files in directory P0_uux_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -801,23 +801,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2
 INFO: Finding symmetric diagrams for subprocess group uux_ttx
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.304 s
-Wrote files for 810 helas calls in 3.574 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s
+Wrote files for 810 helas calls in 3.215 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.355 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+ALOHA: aloha creates 5 routines in 0.333 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.318 s
+ALOHA: aloha creates 10 routines in 0.312 s
 VVV1
 VVV1
 FFV1
@@ -844,12 +844,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
@@ -1021,16 +1022,16 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: p.returncode =  0 [output.py at line 235] 
+DEBUG: p.returncode =  0 [output.py at line 232] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
 Run "open index.html" to see more information about this process.
 quit
-real 0m9.272s
-user 0m8.475s
-sys 0m0.501s
+real 0m11.764s
+user 0m8.242s
+sys 0m0.480s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -1043,7 +1044,7 @@ sys 0m0.501s
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -1076,7 +1077,7 @@ launch in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
index 944298ae75..c0b1a2fd98 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index 0317bbc95a..30815cd085 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
index ecd2d1364e..448175be9d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
index dce732e252..963d8ec072 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
index a48f6997f3..d4e2956b18 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
index d803e4f19f..5b3b723e59 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
index 75110e8fec..fa46e42b8f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
index 3d5ca9d556..e166fa1652 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
index 3d59efb411..2cc5a2026a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
index f9147f699e..2344ddbe81 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
index 4c21758744..1dea73e826 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -304,7 +304,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
index f7f5899260..5e2bf0d19a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
index 9f559fe3ae..37d6ebe981 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
index d528b1d2f0..dd4cd3a0c2 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
index 110e204c24..e28575ead8 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index bf665ff6e0..a885b7fde3 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
index 90a457ac40..3b6b1a6c16 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
index cdc2dc91ac..bf037c6c28 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A.
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index d61f0e1a21..b7d8649204 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 9a73b3ed94..eb62f13990 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index a90abc4ab4..0f49f5247b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index f2eba72de7..c9b8759b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 5ec9701b78..62c235de64 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index b082becd2a..8a699645cd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index dc1a3e9d26..c47ef64ec8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 06af307caa..f8bdb38aee 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index 408403e5d9..628e0d8092 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index 842b1c72d4..b66a887225 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 265f6006db..7bc63ee8a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index cbc45ff652..0cbb15fba7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index a41aa7611a..9f43559181 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index c23550e9b7..84ee7e5b85 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 4e2bfe85ab..aa73f64dba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index c8fbb1cc8b..46e6ff0da7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 5723ed5665..d9f2d09952 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 95f4bf6912..f26b60c5bb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index d196e8ed65..abb75a925b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index e5a0390c47..d6bf2155ff 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 4f966fab6d..fabc6786d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index b8f74ecafe..0d1c319939 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index a54b0bb8fe..853175b477 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index bc732da055..94fe1937c3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 309be94a99..50c024adc3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index c03cebacb0..210884dccf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 2495941a73..8e3985f427 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index d31dd972a9..e60cb5b6d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index 399b68be58..3e0e30af23 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index 23d82657bf..e639ee4c34 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 39422dc34c..a8c5f11ae3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 529477ff3e..22398e7ab4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 4f557f24ab..5329710b87 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. 
Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index da207359fc..94cfdd1487 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index 4d12dfeade..37f4a35577 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index 9e27e48c99..66b1820c10 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e54a24ea57..3955de70dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 1818cf79ed..391789dc81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index cfd6a270b5..5ce83d5f12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 5bac32b00a..ea0697602c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 6bdc5db576..9403b67a1a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8638bbefa2..bfc3d0809f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 41e15f6ad0..2d95f4b170 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index efdae70d19..44e8c9d920 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index 50c16edaac..302d0eda9c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b2cf62531..f51744ae5d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index c071cc6900..222800dcfd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index b93bb3909d..14490d782f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index 72e76f54e4..ab270fe554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 577a8d9c54..e9b4ddc613 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index c5a7b6787c..f93b850d5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 2eb6b491fa..ef9407041b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 2f4866b6ca..1543c29649 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 4b08b69f90..f5ef1f7b43 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index f4e431c5ce..83e40fb02c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index a843f4656a..9996fdea2d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 8682128442..1aa88699db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index dbd5b60487..58cece5c62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 3e29e25982..867eb95566 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 123a3ae00e..ae43656176 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 6d8f6b4ed8..205e3daf83 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7d3141cfc4..5f356a519e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index f92e527895..6bd3135c3c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index 44da6cd9ce..8ded31027d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index a4cb748b19..7ce014f5f5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 53f591633e..dfbec413a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 6ec302f68b..af04d58c3e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 53c3b7149b..4e53fa1250 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index 43ccdff1e1..2acdc960db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index 3a3ed05151..115e19c70e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index dce10b9553..392b30a39f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = 
False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
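[Editor's note] The common_run_interface.py hunks above implement a simple run-once guard: postcmd and check_answer_consistency skip the expensive do_update('dependent') pass whenever it has already run, and do_update itself records that fact by raising the flag. Below is a minimal Python sketch of that memoization idea only; the CardEditor class, its method signatures, and the sleep stand-in are simplified placeholders, not the real interface.

    import time

    class CardEditor:
        def __init__(self):
            # reset at the start of each card-editing session
            self.update_dependent_done = False

        def do_update(self):
            # stand-in for the expensive recomputation of dependent parameters
            time.sleep(0.1)
            self.update_dependent_done = True  # remember that it already ran

        def postcmd(self):
            # called after every command: run the update at most once
            if not self.update_dependent_done:
                self.do_update()

Note that in the actual hunk postcmd also resets the flag to False after the guarded call, so a later explicit update still goes through; the sketch keeps only the core of the pattern.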
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ 
def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
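[Editor's note] The madevent_interface.py change above stops deriving the process directory from the realpath of the launcher script and instead recovers it from the location of the imported internal package: since internal/__init__.py lives at <me_dir>/bin/internal/__init__.py, three dirname calls yield <me_dir>, and me5_logging.conf is then located relative to it. A small self-contained sketch of that derivation (the helper name and example path are hypothetical):

    import os

    def me_dir_from(init_file):
        # init_file = <me_dir>/bin/internal/__init__.py
        # three dirname calls strip __init__.py, internal/ and bin/
        return os.path.dirname(os.path.dirname(os.path.dirname(init_file)))

    me_dir = me_dir_from('/work/pp_tt012j.mad/bin/internal/__init__.py')
    assert me_dir == '/work/pp_tt012j.mad'
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')

Resolving via the package rather than the entry script appears to be the motivation stated in the hunk's own comments: it avoids picking up the wrong tree when PYTHONPATH points at another installation.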
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== From 381424b526a407732c63fc87fdc497989ba3931b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 06:37:21 +0100 Subject: [PATCH 13/14] [gpucpp] rerun 78 tput tests, with FPEs enabled, after the upgrade to 3.5.2 - usual failures in ggttg f/m and gqttq f (#783), no change in performance STARTED AT Thu Nov 9 05:26:21 PM CET 2023 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Thu Nov 9 05:54:46 PM CET 2023 [Status=2] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Thu Nov 9 06:05:38 PM CET 2023 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Thu Nov 9 06:15:05 PM CET 2023 [Status=2] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Thu Nov 9 06:18:20 PM CET 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Thu Nov 9 06:21:32 PM CET 2023 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 36 +++---- 
.../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd1.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 36 +++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 92 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 92 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd1.txt | 92 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_m_inl0_hrd1.txt | 100 +++++++++--------- 78 files changed, 3476 insertions(+), 3476 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 4f18003d70..96be4f25ce 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:15:12 +DATE: 2023-11-09_17:36:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.482370e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.785159e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.963951e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632744e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.846433e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.013402e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677103 sec - 2,617,238,862 cycles # 2.883 GHz - 4,033,048,225 instructions # 1.54 insn per cycle - 0.968798898 seconds time elapsed +TOTAL : 0.666402 sec + 2,677,197,972 cycles # 3.012 GHz + 4,052,373,824 instructions # 1.51 insn per cycle + 0.957128261 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115937e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309320e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.309320e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.324668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.324668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.040810 sec - 18,355,110,031 cycles # 3.037 GHz - 44,036,146,715 instructions # 2.40 insn per cycle - 6.046149721 seconds time elapsed +TOTAL : 5.970581 sec + 18,294,560,469 cycles # 3.063 GHz + 44,035,841,714 instructions # 2.41 insn per cycle + 5.975709847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.614682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109953e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109953e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201099e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201099e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305087 sec - 12,797,655,048 cycles # 2.970 GHz - 31,002,550,325 instructions # 2.42 insn per cycle - 4.310429047 seconds time elapsed +TOTAL : 4.151985 sec + 12,801,375,184 cycles # 3.080 GHz + 31,001,968,290 instructions # 2.42 insn per cycle + 4.157180427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.864325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.864325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.097286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.453382 sec - 10,049,928,632 cycles # 2.906 GHz - 19,377,949,384 instructions # 1.93 insn per cycle - 3.458678566 seconds time elapsed +TOTAL : 3.388202 sec + 10,019,877,774 cycles # 2.954 GHz + 19,377,611,613 instructions # 1.93 insn per cycle + 3.393320382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.139569e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018506e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018506e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.054473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.054473e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.335195 sec - 9,699,652,158 cycles # 2.904 GHz - 18,994,942,569 instructions # 1.96 insn per cycle - 3.340655484 seconds time elapsed +TOTAL : 3.283560 sec + 9,692,698,438 cycles # 2.948 GHz + 19,006,248,514 instructions # 1.96 insn per cycle + 3.288694745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.800324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.836531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.447502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.447502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.895197 sec - 8,617,547,988 cycles # 2.211 GHz - 15,739,004,417 instructions # 1.83 insn per cycle - 3.900641958 seconds time elapsed +TOTAL : 3.828285 sec + 8,619,412,035 cycles # 2.250 GHz + 15,739,302,747 instructions # 1.83 insn per cycle + 3.833534805 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 60971ecd43..46e9abca4a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:50:38 +DATE: 2023-11-09_18:08:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.736559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.745060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.745060e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.786999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766835e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766835e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.222962 sec - 7,400,904,179 cycles # 2.991 GHz - 13,138,789,289 instructions # 1.78 insn per cycle - 2.532867460 seconds time elapsed +TOTAL : 2.197852 sec + 7,407,513,320 cycles # 3.040 GHz + 13,213,549,787 instructions # 1.78 insn per cycle + 2.495471586 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.078362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.258405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.258405e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.262532e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262532e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.440995 sec - 19,547,511,222 cycles # 3.033 GHz - 44,263,760,517 instructions # 2.26 insn per cycle - 6.447379338 seconds time elapsed +TOTAL : 6.417727 sec + 19,594,664,001 cycles # 3.052 GHz + 44,265,878,138 instructions # 2.26 insn per cycle + 6.424119903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.568240e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.019266e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.019266e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589377e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044221e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044221e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.623039 sec - 14,052,579,459 cycles # 3.037 GHz - 31,844,500,266 instructions # 2.27 insn per cycle - 4.629479950 seconds time elapsed +TOTAL : 4.559857 sec + 14,005,526,343 cycles # 3.068 GHz + 31,844,006,198 instructions # 2.27 insn per cycle + 4.566322148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.863308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.529884e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.529884e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.628189e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.628189e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.004138 sec - 11,314,763,691 cycles # 2.822 GHz - 20,739,815,252 instructions # 1.83 insn per cycle - 4.010963262 seconds time elapsed +TOTAL : 3.878054 sec + 11,287,723,645 cycles # 2.906 GHz + 20,738,072,181 instructions # 1.84 insn per cycle + 3.884538371 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.961498e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.695721e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.695721e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.014169e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.779352e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.779352e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.824549 sec
- 10,997,567,801 cycles # 2.871 GHz
- 20,355,988,697 instructions # 1.85 insn per cycle
- 3.831152322 seconds time elapsed
+TOTAL : 3.727856 sec
+ 11,041,223,612 cycles # 2.958 GHz
+ 20,355,670,345 instructions # 1.84 insn per cycle
+ 3.734291913 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.664769e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.161936e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.161936e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.744355e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.276403e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.276403e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.405341 sec
- 9,931,414,577 cycles # 2.252 GHz
- 16,884,401,146 instructions # 1.70 insn per cycle
- 4.411803387 seconds time elapsed
+TOTAL : 4.223001 sec
+ 9,961,082,180 cycles # 2.356 GHz
+ 16,884,642,255 instructions # 1.70 insn per cycle
+ 4.229415228 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
index 75e14339dc..06dd49c8ef 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:03:34
+DATE: 2023-11-09_18:21:50
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.826607e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.612761e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.962341e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.833760e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.622748e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.982780e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 1.301469 sec
- 4,673,993,383 cycles # 3.055 GHz
- 7,270,667,887 instructions # 1.56 insn per cycle
- 1.586588942 seconds time elapsed
+TOTAL : 1.311946 sec
+ 4,695,073,853 cycles # 3.035 GHz
+ 7,228,449,301 instructions # 1.54 insn per cycle
+ 1.606166442 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.143440e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.343019e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.343019e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.133856e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.330921e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.330921e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 6.249393 sec
- 19,374,513,863 cycles # 3.098 GHz
- 44,137,807,645 instructions # 2.28 insn per cycle
- 6.254447436 seconds time elapsed
+TOTAL : 6.295519 sec
+ 19,403,964,054 cycles # 3.081 GHz
+ 44,141,070,523 instructions # 2.27 insn per cycle
+ 6.300790833 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.651049e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.163460e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.163460e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.674176e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.191162e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.191162e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.566003 sec
- 13,842,407,454 cycles # 3.029 GHz
- 31,004,270,304 instructions # 2.24 insn per cycle
- 4.571383086 seconds time elapsed
+TOTAL : 4.504649 sec
+ 13,863,184,367 cycles # 3.075 GHz
+ 31,003,513,865 instructions # 2.24 insn per cycle
+ 4.509943224 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.085679e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.913536e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.913536e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.015608e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.805515e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.805515e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.759234 sec
- 11,164,737,043 cycles # 2.967 GHz
- 19,280,466,147 instructions # 1.73 insn per cycle
- 3.764531843 seconds time elapsed
+TOTAL : 3.880062 sec
+ 11,162,114,716 cycles # 2.882 GHz
+ 19,285,048,189 instructions # 1.73 insn per cycle
+ 3.885435669 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.157188e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.041275e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.041275e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.146900e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.045970e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.045970e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.667981 sec
- 10,833,619,022 cycles # 2.950 GHz
- 18,695,779,485 instructions # 1.73 insn per cycle
- 3.673091045 seconds time elapsed
+TOTAL : 3.683003 sec
+ 10,893,551,236 cycles # 2.955 GHz
+ 18,696,669,062 instructions # 1.72 insn per cycle
+ 3.688290519 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.852503e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.471081e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.471081e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.858668e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.475829e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.475829e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.150740 sec
- 9,740,231,931 cycles # 2.344 GHz
- 15,438,395,407 instructions # 1.59 insn per cycle
- 4.156220859 seconds time elapsed
+TOTAL : 4.138576 sec
+ 9,729,969,286 cycles # 2.349 GHz
+ 15,438,316,077 instructions # 1.59 insn per cycle
+ 4.143776269 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
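The 'Internal loops fptype_sv' lines compared in these hunks encode the SIMD width of the C++ event loop: SCALAR, VECTOR[2] (sse4), VECTOR[4] (avx2/512y) and VECTOR[8] (512z) for double precision. A minimal sketch of such a vector-of-events type, assuming GCC/clang vector extensions (the names fptype, fptype_v and neppV here are illustrative, not necessarily the cudacpp definitions):

// Sketch only: one SIMD "page" of events, one lane per event.
#include <cstddef>
typedef double fptype;                                    // 'DBL' builds use double precision
typedef fptype fptype_v __attribute__((vector_size(32))); // 4 doubles = 256 bits, i.e. VECTOR[4]
constexpr std::size_t neppV = sizeof(fptype_v) / sizeof(fptype); // lanes per vector
inline fptype_v square(const fptype_v& v) { return v * v; } // element-wise work on all lanes at once

Widening vector_size to 64 bytes would give the VECTOR[8] ('512z') layout measured above.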
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
index c2852b0755..148fb0d2ee 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:00:21
+DATE: 2023-11-09_18:18:36
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.830407e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.634363e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.010779e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.853961e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.658990e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.049126e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.985656 sec
- 3,531,228,063 cycles # 2.913 GHz
- 6,990,251,865 instructions # 1.98 insn per cycle
- 1.270939740 seconds time elapsed
+TOTAL : 0.956476 sec
+ 3,586,792,512 cycles # 3.034 GHz
+ 7,163,432,319 instructions # 2.00 insn per cycle
+ 1.241060065 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.143065e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.342569e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.342569e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.134189e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.330626e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.330626e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.897072 sec
- 18,280,833,177 cycles # 3.098 GHz
- 44,034,372,908 instructions # 2.41 insn per cycle
- 5.902241793 seconds time elapsed
+TOTAL : 5.945995 sec
+ 18,306,649,766 cycles # 3.077 GHz
+ 44,036,304,039 instructions # 2.41 insn per cycle
+ 5.951221281 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.647739e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.157991e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.157991e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.656363e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.166761e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.166761e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.221863 sec
- 12,803,042,604 cycles # 3.036 GHz
- 31,005,296,735 instructions # 2.42 insn per cycle
- 4.227230772 seconds time elapsed
+TOTAL : 4.200416 sec
+ 12,751,192,820 cycles # 3.033 GHz
+ 31,001,487,666 instructions # 2.43 insn per cycle
+ 4.205764852 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.083518e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.912332e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.912332e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.102659e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.940126e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.940126e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.412904 sec
- 10,065,358,042 cycles # 2.945 GHz
- 19,377,556,628 instructions # 1.93 insn per cycle
- 3.418078261 seconds time elapsed
+TOTAL : 3.381762 sec
+ 10,061,410,412 cycles # 2.972 GHz
+ 19,378,394,064 instructions # 1.93 insn per cycle
+ 3.387061232 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
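Every EvtsPerSec value diffed in these logs is a throughput, i.e. a processed-event count divided by a timer interval. A hedged sketch of such a counter (the 2048*256 event count mirrors the '-p 2048 256 1' arguments above; the rest is hypothetical, not the gcheck.exe/runTest.exe source):

#include <chrono>
#include <cstdio>
int main() {
  const int nevt = 2048 * 256; // grid x block, as in "-p 2048 256 1"
  const auto t0 = std::chrono::steady_clock::now();
  // ... compute matrix elements for nevt events here ...
  const auto t1 = std::chrono::steady_clock::now();
  const double sec = std::chrono::duration<double>(t1 - t0).count();
  std::printf("EvtsPerSec = ( %e ) sec^-1\n", nevt / sec);
  return 0;
}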
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.178157e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068476e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068476e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.165893e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.060121e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.060121e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.275616 sec
- 9,709,500,834 cycles # 2.960 GHz
- 18,994,586,612 instructions # 1.96 insn per cycle
- 3.280821668 seconds time elapsed
+TOTAL : 3.294663 sec
+ 9,710,957,285 cycles # 2.944 GHz
+ 18,994,988,980 instructions # 1.96 insn per cycle
+ 3.300038627 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.874008e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.497430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.497430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865019e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.483923e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.483923e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.750555 sec
- 8,607,389,256 cycles # 2.292 GHz
- 15,737,632,725 instructions # 1.83 insn per cycle
- 3.755880546 seconds time elapsed
+TOTAL : 3.767379 sec
+ 8,603,525,039 cycles # 2.281 GHz
+ 15,737,455,232 instructions # 1.83 insn per cycle
+ 3.772597879 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
index 6a5b6e889f..d2d2949097 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:57:07
+DATE: 2023-11-09_18:15:21
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.203248e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.569989e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.906875e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.240881e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.587683e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.915014e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.845079 sec
- 6,274,121,781 cycles # 3.027 GHz
- 11,554,949,617 instructions # 1.84 insn per cycle
- 2.129841068 seconds time elapsed
+TOTAL : 1.834661 sec
+ 6,293,478,609 cycles # 3.041 GHz
+ 11,504,742,224 instructions # 1.83 insn per cycle
+ 2.125902004 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.133729e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.330465e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.330465e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.133681e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.328323e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.328323e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.944694 sec
- 18,288,311,212 cycles # 3.074 GHz
- 44,034,741,687 instructions # 2.41 insn per cycle
- 5.950018785 seconds time elapsed
+TOTAL : 5.944399 sec
+ 18,276,841,424 cycles # 3.072 GHz
+ 44,034,753,944 instructions # 2.41 insn per cycle
+ 5.949724506 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.659128e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.174088e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.174088e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.688289e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.207763e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.207763e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.192977 sec
- 12,790,691,952 cycles # 3.048 GHz
- 31,002,731,251 instructions # 2.42 insn per cycle
- 4.198334883 seconds time elapsed
+TOTAL : 4.121368 sec
+ 12,748,827,025 cycles # 3.090 GHz
+ 31,001,833,202 instructions # 2.43 insn per cycle
+ 4.126844954 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.084534e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.927805e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.927805e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.079781e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.896967e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.896967e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.410971 sec
- 10,102,470,059 cycles # 2.959 GHz
- 19,378,571,736 instructions # 1.92 insn per cycle
- 3.416356813 seconds time elapsed
+TOTAL : 3.417656 sec
+ 10,039,679,603 cycles # 2.934 GHz
+ 19,377,458,106 instructions # 1.93 insn per cycle
+ 3.423002014 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.180416e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.077058e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.077058e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.191213e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.094392e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.094392e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.272531 sec
- 9,723,824,348 cycles # 2.967 GHz
- 19,005,371,454 instructions # 1.95 insn per cycle
- 3.277801420 seconds time elapsed
+TOTAL : 3.256603 sec
+ 9,688,244,134 cycles # 2.971 GHz
+ 19,005,599,231 instructions # 1.96 insn per cycle
+ 3.261875957 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.875765e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.503453e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.503453e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.880720e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.508965e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.508965e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.745624 sec
- 8,623,946,797 cycles # 2.300 GHz
- 15,739,753,667 instructions # 1.83 insn per cycle
- 3.750856873 seconds time elapsed
+TOTAL : 3.737405 sec
+ 8,601,041,918 cycles # 2.299 GHz
+ 15,737,525,138 instructions # 1.83 insn per cycle
+ 3.742726567 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
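The derived columns in the counter lines can be cross-checked by hand: the GHz figure is cycles per second and 'insn per cycle' is instructions over cycles. A plain arithmetic check against the '+' rows of the 512z hunk just above (any small residual difference is expected if the profiler normalizes by CPU time rather than wall-clock time):

#include <cstdio>
int main() {
  const double cycles = 8601041918.0;  // "8,601,041,918 cycles # 2.299 GHz"
  const double insns = 15737525138.0;  // "15,737,525,138 instructions # 1.83 insn per cycle"
  const double elapsed = 3.742726567;  // "3.742726567 seconds time elapsed"
  std::printf("%.3f GHz\n", cycles / elapsed / 1e9); // ~2.298
  std::printf("%.2f insn per cycle\n", insns / cycles); // ~1.83
  return 0;
}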
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index 3b69c80285..2943a1e3d5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:15:46
+DATE: 2023-11-09_17:37:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.519106e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.841619e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.067099e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.636703e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.863019e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.046703e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.661524 sec
- 2,624,385,702 cycles # 2.945 GHz
- 4,009,504,923 instructions # 1.53 insn per cycle
- 0.953550123 seconds time elapsed
+TOTAL : 0.654694 sec
+ 2,666,558,745 cycles # 3.022 GHz
+ 4,096,338,325 instructions # 1.54 insn per cycle
+ 0.944612967 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.178868e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.397031e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.397031e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.202919e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.424199e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.424199e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.737674 sec
- 17,431,892,883 cycles # 3.036 GHz
- 41,881,565,184 instructions # 2.40 insn per cycle
- 5.743076445 seconds time elapsed
+TOTAL : 5.624164 sec
+ 17,409,154,909 cycles # 3.093 GHz
+ 41,881,099,052 instructions # 2.41 insn per cycle
+ 5.629252249 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.685142e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.222963e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.222963e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.734385e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.287483e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.287483e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.136316 sec
- 12,482,235,541 cycles # 3.016 GHz
- 30,165,183,766 instructions # 2.42 insn per cycle
- 4.141750487 seconds time elapsed
+TOTAL : 4.020839 sec
+ 12,439,753,645 cycles # 3.090 GHz
+ 30,163,334,779 instructions # 2.42 insn per cycle
+ 4.026082449 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.065221e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.894043e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.894043e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.071596e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.904428e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.904428e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.443708 sec
- 9,960,024,892 cycles # 2.889 GHz
- 19,109,707,129 instructions # 1.92 insn per cycle
- 3.449179794 seconds time elapsed
+TOTAL : 3.432943 sec
+ 9,954,541,311 cycles # 2.896 GHz
+ 19,109,473,980 instructions # 1.92 insn per cycle
+ 3.438069931 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1930) (512y: 0) (512z: 0)
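The recurring 'CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions' warnings indicate that these tests run with FP exception trapping turned on, which is what makes the '(NaN/abnormal=0, zero=0)' tallies meaningful. A sketch of how such an environment-variable opt-in could look (glibc-specific feenableexcept, visible by default under g++; the actual cudacpp hook may differ):

#include <cstdlib>
#include <fenv.h> // feenableexcept is a GNU extension
void enableFPEifRequested() {
  // Hypothetical illustration: trap the FP conditions that would produce NaN/inf.
  if (std::getenv("CUDACPP_RUNTIME_ENABLEFPE"))
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
}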
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.139235e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.013091e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.013091e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.172502e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.071351e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.071351e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.337798 sec
- 9,694,110,840 cycles # 2.900 GHz
- 18,764,903,742 instructions # 1.94 insn per cycle
- 3.343110507 seconds time elapsed
+TOTAL : 3.287111 sec
+ 9,635,946,931 cycles # 2.927 GHz
+ 18,764,577,329 instructions # 1.95 insn per cycle
+ 3.292294749 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 178) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.864706e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.496201e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.496201e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.921117e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.582437e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.582437e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.773287 sec
- 8,448,094,450 cycles # 2.236 GHz
- 15,614,366,385 instructions # 1.85 insn per cycle
- 3.778658466 seconds time elapsed
+TOTAL : 3.666524 sec
+ 8,448,044,488 cycles # 2.302 GHz
+ 15,613,692,408 instructions # 1.85 insn per cycle
+ 3.671704856 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1239)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index abd8e16103..e7918e9c23 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:39:41
+DATE: 2023-11-09_17:58:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.541150e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.656561e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.025623e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.801176e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.647831e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.027831e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.677402 sec
- 2,672,042,758 cycles # 2.933 GHz
- 4,104,960,698 instructions # 1.54 insn per cycle
- 0.969965661 seconds time elapsed
+TOTAL : 0.681809 sec
+ 2,713,657,783 cycles # 2.966 GHz
+ 4,201,847,315 instructions # 1.55 insn per cycle
+ 0.974362645 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.643045e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.106548e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.106548e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.699910e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.178375e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.178375e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.237683 sec
- 12,698,973,738 cycles # 2.997 GHz
- 32,580,365,424 instructions # 2.57 insn per cycle
- 4.243310096 seconds time elapsed
+TOTAL : 4.094284 sec
+ 12,664,884,276 cycles # 3.090 GHz
+ 32,577,115,805 instructions # 2.57 insn per cycle
+ 4.099557701 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.102523e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.004727e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.004727e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.143219e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.065278e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.065278e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.394812 sec
- 10,279,599,861 cycles # 3.024 GHz
- 24,505,440,482 instructions # 2.38 insn per cycle
- 3.400499086 seconds time elapsed
+TOTAL : 3.331773 sec
+ 10,271,423,521 cycles # 3.079 GHz
+ 24,506,625,447 instructions # 2.39 insn per cycle
+ 3.337328311 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.301834e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.372180e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.372180e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.319805e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.394403e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.394403e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.131325 sec
- 9,114,816,336 cycles # 2.906 GHz
- 16,941,253,973 instructions # 1.86 insn per cycle
- 3.136898880 seconds time elapsed
+TOTAL : 3.108988 sec
+ 9,122,185,757 cycles # 2.931 GHz
+ 16,942,074,182 instructions # 1.86 insn per cycle
+ 3.114300266 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.334227e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.444641e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.444641e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.263608e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.556489e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.556489e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.093526 sec
- 8,877,539,414 cycles # 2.866 GHz
- 16,358,190,505 instructions # 1.84 insn per cycle
- 3.099088246 seconds time elapsed
+TOTAL : 3.169374 sec
+ 9,426,858,565 cycles # 2.970 GHz
+ 16,370,203,044 instructions # 1.74 insn per cycle
+ 3.174743316 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.978126e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.726122e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.726122e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.105750e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.926413e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.926413e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.588578 sec
- 7,927,907,472 cycles # 2.207 GHz
- 14,594,253,089 instructions # 1.84 insn per cycle
- 3.594362581 seconds time elapsed
+TOTAL : 3.377253 sec
+ 7,897,254,276 cycles # 2.335 GHz
+ 14,592,693,571 instructions # 1.85 insn per cycle
+ 3.382567542 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955)
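The inl1 builds above ([inlineHel=1]) reach visibly higher throughputs than their inl0 counterparts (e.g. ~2.1e+06 vs ~1.3e+06 events/s for the no-SIMD MatrixElems rate), consistent with forced inlining of the helicity amplitude helpers letting the compiler optimize across call sites. A hedged sketch of the mechanism (the macro name and attribute usage are illustrative, not the actual cudacpp configuration):

// Sketch: an "inlineHel=1"-style build force-inlines the amplitude helpers.
#ifdef FORCE_INLINE_HELAMPS // hypothetical build flag
#define HELINLINE inline __attribute__((always_inline))
#else
#define HELINLINE inline
#endif
HELINLINE double helAmp(double p) { return p * p; } // stand-in for a vertex kernel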
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index d14dcc2cec..676eafadb1 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:40:11
+DATE: 2023-11-09_17:58:39
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.548142e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.673863e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.063444e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.818208e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.668713e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.053456e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.673757 sec
- 2,682,929,459 cycles # 2.958 GHz
- 4,116,085,529 instructions # 1.53 insn per cycle
- 0.967020710 seconds time elapsed
+TOTAL : 0.673433 sec
+ 2,679,233,058 cycles # 2.963 GHz
+ 4,187,218,910 instructions # 1.56 insn per cycle
+ 0.965878825 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.187961e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.086286e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.086286e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.244543e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.167572e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.167572e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.278046 sec
- 9,891,835,516 cycles # 3.013 GHz
- 25,457,241,379 instructions # 2.57 insn per cycle
- 3.283538395 seconds time elapsed
+TOTAL : 3.200260 sec
+ 9,840,700,159 cycles # 3.071 GHz
+ 25,456,933,061 instructions # 2.59 insn per cycle
+ 3.205821754 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.461475e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.800212e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.800212e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.515705e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.876135e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.876135e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.961448 sec
- 8,958,054,464 cycles # 3.020 GHz
- 21,514,605,384 instructions # 2.40 insn per cycle
- 2.967091806 seconds time elapsed
+TOTAL : 2.896836 sec
+ 8,925,793,988 cycles # 3.076 GHz
+ 21,514,573,078 instructions # 2.41 insn per cycle
+ 2.902177430 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.449114e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.718886e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.718886e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.506104e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.783990e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.783990e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.969121 sec
- 8,647,101,919 cycles # 2.908 GHz
- 15,830,093,651 instructions # 1.83 insn per cycle
- 2.974697377 seconds time elapsed
+TOTAL : 2.900756 sec
+ 8,606,887,419 cycles # 2.962 GHz
+ 15,829,788,154 instructions # 1.84 insn per cycle
+ 2.906279310 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.514280e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.825562e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.825562e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.541955e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.879613e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.879613e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.898480 sec
- 8,435,230,503 cycles # 2.906 GHz
- 15,528,950,884 instructions # 1.84 insn per cycle
- 2.904204103 seconds time elapsed
+TOTAL : 2.869140 sec
+ 8,396,471,591 cycles # 2.922 GHz
+ 15,529,030,850 instructions # 1.85 insn per cycle
+ 2.874505432 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.166244e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.072345e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.072345e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.119247e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.990719e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.990719e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.304157 sec
- 7,572,571,500 cycles # 2.289 GHz
- 14,293,792,931 instructions # 1.89 insn per cycle
- 3.309751939 seconds time elapsed
+TOTAL : 3.376497 sec
+ 7,569,554,118 cycles # 2.239 GHz
+ 14,295,014,243 instructions # 1.89 insn per cycle
+ 3.381953719 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index cfc01e370f..b0b6c7dbbf 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:16:19
+DATE: 2023-11-09_17:37:41
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.506984e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.290770e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.275463e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.535063e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.287307e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.259593e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.565965 sec
- 2,321,819,505 cycles # 2.946 GHz
- 3,610,558,250 instructions # 1.56 insn per cycle
- 0.846354753 seconds time elapsed
+TOTAL : 0.562225 sec
+ 2,332,457,444 cycles # 2.979 GHz
+ 3,625,755,159 instructions # 1.55 insn per cycle
+ 0.842176648 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.127208e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.335415e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.335415e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.164715e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.380430e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.380430e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 5.947738 sec
- 17,831,603,454 cycles # 2.997 GHz
- 43,615,812,813 instructions # 2.45 insn per cycle
- 5.952849241 seconds time elapsed
+TOTAL : 5.760009 sec
+ 17,802,097,031 cycles # 3.089 GHz
+ 43,613,527,077 instructions # 2.45 insn per cycle
+ 5.764750077 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.344868e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.581929e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.581929e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.392272e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.663586e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.663586e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.049781 sec
- 9,255,993,248 cycles # 3.030 GHz
- 21,926,767,970 instructions # 2.37 insn per cycle
- 3.055067484 seconds time elapsed
+TOTAL : 2.985891 sec
+ 9,233,559,019 cycles # 3.088 GHz
+ 21,925,837,880 instructions # 2.37 insn per cycle
+ 2.990875616 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.528612e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.886098e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.886098e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.561578e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.939602e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.939602e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.841538 sec
- 8,310,122,274 cycles # 2.920 GHz
- 15,590,852,784 instructions # 1.88 insn per cycle
- 2.846613446 seconds time elapsed
+TOTAL : 2.807792 sec
+ 8,302,482,665 cycles # 2.952 GHz
+ 15,590,734,796 instructions # 1.88 insn per cycle
+ 2.812825281 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.544975e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.933439e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.933439e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.577370e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.998184e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.998184e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.829740 sec
- 8,228,769,997 cycles # 2.904 GHz
- 15,439,791,314 instructions # 1.88 insn per cycle
- 2.834839900 seconds time elapsed
+TOTAL : 2.791624 sec
+ 8,243,582,435 cycles # 2.950 GHz
+ 15,435,159,534 instructions # 1.87 insn per cycle
+ 2.796691298 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.468064e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.774733e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.774733e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.534202e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.878199e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.878199e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.920266 sec
- 6,654,443,055 cycles # 2.276 GHz
- 12,870,591,658 instructions # 1.93 insn per cycle
- 2.925460933 seconds time elapsed
+TOTAL : 2.844579 sec
+ 6,638,595,923 cycles # 2.339 GHz
+ 12,873,058,969 instructions # 1.94 insn per cycle
+ 2.849721551 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439)
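In the FLT logs above each SIMD mode carries twice the lanes of the corresponding DBL build (VECTOR[4] sse4, VECTOR[8] avx2/512y, VECTOR[16] 512z), simply because a float is half the width of a double. A minimal sketch of that relation (illustrative typedefs, not the project's actual configuration header):

// With a fixed 512-bit register, the lane count doubles when fptype shrinks.
#ifdef USE_FLOAT // hypothetical switch for the 'FLT' builds
typedef float fptype;  // 16 lanes per 512-bit vector: VECTOR[16]
#else
typedef double fptype; // 8 lanes per 512-bit vector: VECTOR[8]
#endif
typedef fptype fptype_sv __attribute__((vector_size(64)));
static_assert(sizeof(fptype_sv) / sizeof(fptype) == (sizeof(fptype) == 4 ? 16 : 8), "lane count");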
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index b89c0950e0..198199e430 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:51:16
+DATE: 2023-11-09_18:09:35
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.262139e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.843159e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.843159e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.497702e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.965150e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.965150e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.672229 sec
- 5,680,712,756 cycles # 2.985 GHz
- 10,249,439,391 instructions # 1.80 insn per cycle
- 1.960159582 seconds time elapsed
+TOTAL : 1.636588 sec
+ 5,687,776,927 cycles # 3.043 GHz
+ 10,344,643,155 instructions # 1.82 insn per cycle
+ 1.926222709 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.122888e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.326457e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.326457e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.124698e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.329265e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.329265e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.070293 sec
- 18,467,877,178 cycles # 3.040 GHz
- 43,763,046,084 instructions # 2.37 insn per cycle
- 6.076144883 seconds time elapsed
+TOTAL : 6.061752 sec
+ 18,474,797,660 cycles # 3.045 GHz
+ 43,763,223,756 instructions # 2.37 insn per cycle
+ 6.067744277 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.241087e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.353707e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.353707e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.280805e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.418662e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.418662e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.295191 sec
- 10,020,961,358 cycles # 3.037 GHz
- 23,261,304,628 instructions # 2.32 insn per cycle
- 3.301360149 seconds time elapsed
+TOTAL : 3.239576 sec
+ 10,001,339,639 cycles # 3.083 GHz
+ 23,260,791,069 instructions # 2.33 insn per cycle
+ 3.245668541 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.364429e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.552712e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.552712e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.455472e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.697664e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.697664e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.146782 sec
- 9,058,696,000 cycles # 2.874 GHz
- 16,711,646,468 instructions # 1.84 insn per cycle
- 3.152847146 seconds time elapsed
+TOTAL : 3.034907 sec
+ 9,092,859,245 cycles # 2.991 GHz
+ 16,710,213,462 instructions # 1.84 insn per cycle
+ 3.041109346 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.299176e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.448559e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.448559e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.469626e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.746671e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.746671e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.242101 sec
- 8,995,544,368 cycles # 2.776 GHz
- 16,559,826,795 instructions # 1.84 insn per cycle
- 3.248399630 seconds time elapsed
+TOTAL : 3.026366 sec
+ 9,019,828,246 cycles # 2.976 GHz
+ 16,555,168,621 instructions # 1.84 insn per cycle
+ 3.032449491 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.425438e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.624655e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.624655e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.460766e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.686068e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.686068e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 3.082964 sec
- 7,440,102,740 cycles # 2.410 GHz
- 14,077,595,444 instructions # 1.89 insn per cycle
- 3.089018136 seconds time elapsed
+TOTAL : 3.037604 sec
+ 7,413,210,247 cycles # 2.436 GHz
+ 14,077,138,025 instructions # 1.90 insn per cycle
+ 3.043934055 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index a9a0d75eb2..38db2540d0 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:04:10
+DATE: 2023-11-09_18:22:25
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.383746e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.209904e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.237350e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.382431e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.208254e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.230961e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.160269 sec
- 4,203,267,927 cycles # 3.027 GHz
- 6,686,907,403 instructions # 1.59 insn per cycle
- 1.447760091 seconds time elapsed
+TOTAL : 1.150438 sec
+ 4,093,367,606 cycles # 2.986 GHz
+ 6,655,787,532 instructions # 1.63 insn per cycle
+ 1.427536965 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.159461e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.377089e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.377089e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.163257e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.379748e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.379748e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 6.100201 sec
- 18,832,208,439 cycles # 3.085 GHz
- 43,796,080,670 instructions # 2.33 insn per cycle
- 6.105246671 seconds time elapsed
+TOTAL : 6.085036 sec
+ 18,810,997,513 cycles # 3.089 GHz
+ 43,795,620,513 instructions # 2.33 insn per cycle
+ 6.090075734 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.360687e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.606052e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.606052e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.379076e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.642823e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.642823e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 3.340864 sec
- 10,252,717,994 cycles # 3.065 GHz
- 22,009,397,675 instructions # 2.15 insn per cycle
- 3.349625818 seconds time elapsed
+TOTAL : 3.315467 sec
+ 10,223,065,521 cycles # 3.080 GHz
+ 22,006,854,632 instructions # 2.15 insn per cycle
+ 3.320462987 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.544336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.928692e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.928692e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825644e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.145870 sec - 9,340,548,482 cycles # 2.966 GHz - 15,504,284,674 instructions # 1.66 insn per cycle - 3.151101472 seconds time elapsed +TOTAL : 3.212098 sec + 9,324,905,009 cycles # 2.900 GHz + 15,502,708,810 instructions # 1.66 insn per cycle + 3.217273015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.968460e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.968460e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.573485e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.002018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.002018e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.140902 sec - 9,274,295,743 cycles # 2.952 GHz - 15,151,601,553 instructions # 1.63 insn per cycle - 3.145942426 seconds time elapsed +TOTAL : 3.120613 sec + 9,288,549,778 cycles # 2.973 GHz + 15,149,849,415 instructions # 1.63 insn per cycle + 3.125542581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.615564e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.042980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.042980e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.617810e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.038860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.038860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.083778 sec - 7,670,760,165 cycles # 2.484 GHz - 12,580,664,280 instructions # 1.64 insn per cycle - 3.088953388 seconds time elapsed +TOTAL : 3.081671 sec + 7,641,480,002 cycles # 2.476 GHz + 12,579,693,620 instructions # 1.65 insn per cycle + 3.086750346 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
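
Note on the metrics above: the EvtsPerSec[Rmb+ME], EvtsPerSec[MatrixElems] and EvtsPerSec[MECalcOnly] figures are simply event counts divided by the wall-clock time of the corresponding workflow stage. A minimal sketch of how such a rate can be derived with std::chrono; the event count and the placeholder workload are illustrative, this is not the actual check.exe/gcheck.exe timer code:

  #include <chrono>
  #include <cstdio>
  int main() {
    constexpr int nevt = 524288; // e.g. gpublocks=2048 x gputhreads=256, as in the grids above
    const auto t0 = std::chrono::steady_clock::now();
    volatile double dummy = 0;
    for( int i = 0; i < nevt; i++ ) dummy = dummy + 1; // placeholder for the ME computation
    const auto t1 = std::chrono::steady_clock::now();
    const double secs = std::chrono::duration<double>( t1 - t0 ).count();
    std::printf( "EvtsPerSec = ( %e ) sec^-1\n", nevt / secs );
    return 0;
  }
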
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index e8e5add4c9..6fcc7aa480 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_22:00:55 +DATE: 2023-11-09_18:19:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217605e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.255851e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.390821e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223370e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.268045e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.834564 sec - 3,199,482,421 cycles # 3.039 GHz - 6,490,454,019 instructions # 2.03 insn per cycle - 1.111753408 seconds time elapsed +TOTAL : 0.831823 sec + 3,198,187,473 cycles # 3.040 GHz + 6,464,633,768 instructions # 2.02 insn per cycle + 1.108743988 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.150543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.366095e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.366095e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.383502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.833467 sec - 17,826,844,076 cycles # 3.054 GHz - 43,615,420,578 instructions # 2.45 insn per cycle - 5.838895279 seconds time elapsed +TOTAL : 5.750668 sec + 17,811,310,529 cycles # 3.095 GHz + 43,613,299,638 instructions # 2.45 insn per cycle + 5.755604942 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.337314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.571728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.571728e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.317079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.054886 sec - 9,243,837,324 cycles # 3.022 GHz - 21,925,827,754 instructions # 2.37 insn per cycle - 3.060063052 seconds time elapsed +TOTAL : 3.082399 sec + 9,236,711,908 cycles # 2.992 GHz + 21,926,264,881 instructions # 2.37 insn per cycle + 3.087937460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.568595e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.965452e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.965452e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.562942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.932578e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932578e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.797209 sec - 8,337,217,151 cycles # 2.976 GHz - 15,590,584,627 instructions # 1.87 insn per cycle - 2.802297250 seconds time elapsed +TOTAL : 2.803489 sec + 8,311,895,996 cycles # 2.960 GHz + 15,590,591,103 instructions # 1.88 insn per cycle + 2.808434072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) 
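
Note on the recurring banner: "CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" refers to trapping floating-point errors at run time instead of letting NaNs propagate silently. On glibc this is typically done with the feenableexcept GNU extension; a hedged sketch of how such an environment switch could be wired up (the exact trap mask used by the cudacpp runtime is not visible in these logs):

  #include <cstdlib> // std::getenv
  #include <fenv.h>  // feenableexcept (GNU extension; g++ defines _GNU_SOURCE by default)
  void enableFPEifRequested() {
    if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) ) // trap if the variable is set at all
      feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); // raise SIGFPE on these
  }
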
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.613887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.042160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.042160e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.582757e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993146e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.753850 sec - 8,236,246,865 cycles # 2.988 GHz - 15,440,580,051 instructions # 1.87 insn per cycle - 2.758988038 seconds time elapsed +TOTAL : 2.784903 sec + 8,236,233,463 cycles # 2.953 GHz + 15,439,539,485 instructions # 1.87 insn per cycle + 2.790025696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.649804e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.085948e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.085948e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.640868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066609e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066609e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.730536 sec - 6,628,841,045 cycles # 2.424 GHz - 12,869,136,387 instructions # 1.94 insn per cycle - 2.735524791 seconds time elapsed +TOTAL : 2.739279 sec + 6,618,156,482 cycles # 2.412 GHz + 12,869,303,752 instructions # 1.94 insn per cycle + 2.744541017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 4353a0323c..ef7d7310ec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:57:41 +DATE: 2023-11-09_18:15:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.439872e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182276e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.152259e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.457534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184951e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.150897e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.433462 sec - 5,039,023,930 cycles # 3.052 GHz - 9,234,566,396 instructions # 1.83 insn per cycle - 1.710073871 seconds time elapsed +TOTAL : 1.431692 sec + 5,029,016,765 cycles # 3.047 GHz + 9,191,843,408 instructions # 1.83 insn per cycle + 1.708626202 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.165155e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.381805e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.381805e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.163854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380282e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.755055 sec - 17,830,794,091 cycles # 3.096 GHz - 43,613,836,777 instructions # 2.45 insn per cycle - 5.760227416 seconds time elapsed +TOTAL : 5.764377 sec + 17,805,909,761 cycles # 3.087 GHz + 43,613,494,568 instructions # 2.45 insn per cycle + 5.769597959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.340707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569922e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569922e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.391849e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652855e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.052308 sec - 9,235,069,524 cycles # 3.022 GHz - 21,925,950,370 instructions # 2.37 insn per cycle - 3.057391403 seconds time elapsed +TOTAL : 2.987897 sec + 9,257,292,453 cycles # 3.094 GHz + 21,926,827,781 instructions # 2.37 insn per cycle + 2.993012479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.565429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.942662e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.942662e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.568515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.950394e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950394e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.806664 sec - 8,327,245,678 cycles # 2.963 GHz - 15,591,035,358 instructions # 1.87 insn per cycle - 2.811768123 seconds time elapsed +TOTAL : 2.797984 sec + 8,317,461,722 cycles # 2.968 GHz + 15,591,357,650 instructions # 1.87 insn per cycle + 2.803063629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.574877e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971987e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.971987e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.510607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874545e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874545e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.795905 sec - 8,237,659,186 cycles # 2.942 GHz - 15,439,551,856 instructions # 1.87 insn per cycle - 2.800978610 seconds time elapsed +TOTAL : 2.865428 sec + 8,258,982,824 cycles # 2.878 GHz + 15,434,974,292 instructions # 1.87 insn per cycle + 2.870509731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.534626e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.883996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.883996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.749607 sec - 6,653,390,801 cycles # 2.416 GHz - 12,870,556,050 instructions # 1.93 insn per cycle - 2.754896991 seconds time elapsed +TOTAL : 2.847653 sec + 6,630,370,490 cycles # 2.325 GHz + 12,869,864,045 instructions # 1.94 insn per cycle + 2.852728913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
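
Note on the SIMD modes: the "Internal loops fptype_sv = VECTOR[n]" lines record the width of the compiler vector type used in the inner event loop, here 4 floats per SSE4.2 register, 8 per AVX2/512y register and 16 per 512z register (halved in double precision). A sketch in the spirit of those vector types, using the gcc/clang vector_size extension; the names neppV and fptype_v are illustrative:

  constexpr int neppV = 8; // events per vector, e.g. AVX2 in single precision
  typedef float fptype;
  typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) );
  fptype_v axpy( fptype a, fptype_v x, fptype_v y ) {
    return a * x + y; // one statement updates neppV events in lockstep
  }

The "=Symbols in CPPProcess.o=" counts quoted after each build are a disassembly-level cross-check that the intended instruction set was actually emitted.
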
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 4a8bf7a45a..acb88982d2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:16:49 +DATE: 2023-11-09_17:38:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.504004e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.299164e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.301394e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537187e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.294303e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293124e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.565713 sec - 2,319,019,949 cycles # 2.949 GHz - 3,628,185,594 instructions # 1.56 insn per cycle - 0.846311751 seconds time elapsed +TOTAL : 0.560273 sec + 2,360,194,998 cycles # 3.018 GHz + 3,675,767,532 instructions # 1.56 insn per cycle + 0.839402775 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.206183e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.450008e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.450008e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.245068e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494792e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494792e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.582273 sec - 16,756,629,307 cycles # 2.999 GHz - 41,373,009,702 instructions # 2.47 insn per cycle - 5.587382956 seconds time elapsed +TOTAL : 5.409364 sec + 16,727,058,520 cycles # 3.090 GHz + 41,371,618,921 instructions # 2.47 insn per cycle + 5.414214747 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.401015e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738811e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441577e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.817766e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817766e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.986422 sec - 9,012,092,925 cycles # 3.013 GHz - 21,229,937,185 instructions # 2.36 insn per cycle - 2.991621252 seconds time elapsed +TOTAL : 2.932681 sec + 9,069,604,999 cycles # 3.089 GHz + 21,230,786,011 instructions # 2.34 insn per cycle + 2.937680542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1841) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541320e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.913153e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.913153e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599334e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.008101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.008101e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.831515 sec - 8,274,365,196 cycles # 2.917 GHz - 15,424,948,763 instructions # 1.86 insn per cycle - 2.836960602 seconds time elapsed +TOTAL : 2.767998 sec + 8,243,229,329 cycles # 2.973 GHz + 15,424,533,858 instructions # 1.87 insn per cycle + 2.772999466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2536) (512y: 0) (512z: 0) 
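
Note on the hrd0/hrd1 pairs: they differ only in hardcodePARAM, i.e. whether physics parameters are loaded at run time or baked in as compile-time constants, and the profiles above show the effect on GPU register pressure (launch__registers_per_thread 95 for hrd1 versus 117 for hrd0). An illustrative sketch of the two options; the macro and parameter names are placeholders, not the project's actual parameters class:

  #ifdef HARDCODE_PARAM
  // hrd1: constants the optimiser can fold directly into the kernels
  constexpr double mdl_MZ = 91.188;
  constexpr double mdl_WZ = 2.441404;
  #else
  // hrd0: values filled in once at run time, e.g. from param_card.dat
  extern double mdl_MZ;
  extern double mdl_WZ;
  #endif
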
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.599740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.051139e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.051139e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.643252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.114551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.773779 sec - 8,126,258,677 cycles # 2.925 GHz - 15,238,451,861 instructions # 1.88 insn per cycle - 2.778950300 seconds time elapsed +TOTAL : 2.727085 sec + 8,130,917,009 cycles # 2.977 GHz + 15,244,999,510 instructions # 1.87 insn per cycle + 2.732127705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.571238e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.958685e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.958685e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.930183e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.930183e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.804796 sec - 6,629,701,677 cycles # 2.360 GHz - 12,848,530,488 instructions # 1.94 insn per cycle - 2.809910943 seconds time elapsed +TOTAL : 2.826162 sec + 6,610,785,893 cycles # 2.336 GHz + 12,848,595,223 instructions # 1.94 insn per cycle + 2.831354272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1705) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index b8155a680e..1f616951f6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:40:40 +DATE: 2023-11-09_17:59:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.302615e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188065e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.274309e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.379623e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224230e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277206e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574549 sec - 2,352,849,250 cycles # 2.917 GHz - 3,649,350,219 instructions # 1.55 insn per cycle - 0.863978578 seconds time elapsed +TOTAL : 0.567867 sec + 2,380,227,304 cycles # 3.011 GHz + 3,716,615,660 instructions # 1.56 insn per cycle + 0.847985852 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.686060e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.194010e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.194010e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.702473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225484e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.097911 sec - 12,184,788,464 cycles # 2.970 GHz - 32,521,623,255 instructions # 2.67 insn per cycle - 4.103328943 seconds time elapsed +TOTAL : 4.053519 sec + 12,216,293,497 cycles # 3.011 GHz + 32,522,254,109 instructions # 2.66 insn per cycle + 4.058663851 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.770837e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689962e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.830691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.806288e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.806288e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.634890 sec - 7,998,179,733 cycles # 3.030 GHz - 18,690,180,922 instructions # 2.34 insn per cycle - 2.640235037 seconds time elapsed +TOTAL : 2.580563 sec + 7,975,462,428 cycles # 3.085 GHz + 18,690,132,924 instructions # 2.34 insn per cycle + 2.585721810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.861879e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.750654e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.750654e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.931453e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.867355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.867355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.559375 sec - 7,467,736,067 cycles # 2.913 GHz - 14,255,217,150 instructions # 1.91 insn per cycle - 2.564904201 seconds time elapsed +TOTAL : 2.497040 sec + 7,461,995,802 cycles # 2.983 GHz + 14,254,175,720 instructions # 1.91 insn per cycle + 2.502220546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.908800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.910304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.910304e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.990445e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.025789e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.025789e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.522982 sec - 7,364,286,769 cycles # 2.913 GHz - 13,952,625,236 instructions # 1.89 insn per cycle - 2.528348787 seconds time elapsed +TOTAL : 2.453022 sec + 7,312,763,088 cycles # 2.976 GHz + 13,952,233,674 instructions # 1.91 insn per cycle + 2.458314250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.584257e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.006941e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.006941e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.649642e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.141006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.801165 sec - 6,529,127,011 cycles # 2.327 GHz - 13,421,836,325 instructions # 2.06 insn per cycle - 2.806446897 seconds time elapsed +TOTAL : 2.733236 sec + 6,541,090,853 cycles # 2.390 GHz + 13,422,969,862 instructions # 2.05 insn per cycle + 2.738380923 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) 
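
Note on the inl0/inl1 pairs: they toggle inlineHel, i.e. whether the helicity amplitude routines are force-inlined into the sigmaKin kernel; for this simple process the effect is large in the scalar build (MECalcOnly around 2.2e+06 sec^-1 above versus around 1.4e+06 for inl0). A generic sketch of such a switch; the macro name and the simplified FFV-style signature are illustrative:

  #ifdef INLINE_HELAMPS
  #define INLINE inline __attribute__( ( always_inline ) )
  #else
  #define INLINE // let the compiler decide
  #endif
  INLINE void FFV1_0( const float* F1, const float* F2, const float* V3, float* vertex ) {
    *vertex = F1[0] * F2[0] * V3[0]; // placeholder for the real amplitude algebra
  }
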
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 385ce72d78..374f2a331e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:41:07 +DATE: 2023-11-09_17:59:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.304320e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197410e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300141e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383788e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237025e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315197e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574067 sec - 2,385,415,994 cycles # 2.943 GHz - 3,655,710,101 instructions # 1.53 insn per cycle - 0.868231647 seconds time elapsed +TOTAL : 0.566447 sec + 2,356,919,781 cycles # 2.991 GHz + 3,683,739,571 instructions # 1.56 insn per cycle + 0.846741071 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.254695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.384461e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.384461e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.154781 sec - 9,423,263,848 cycles # 2.983 GHz - 25,307,020,372 instructions # 2.69 insn per cycle - 3.160042496 seconds time elapsed +TOTAL : 3.069124 sec + 9,404,467,335 cycles # 3.060 GHz + 25,307,412,416 instructions # 2.69 insn per cycle + 3.074433972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.134634e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.819272e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.819272e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.164094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.875777e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.875777e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.372030 sec - 7,183,608,233 cycles # 3.022 GHz - 16,901,599,192 instructions # 2.35 insn per cycle - 2.377377295 seconds time elapsed +TOTAL : 2.347070 sec + 7,183,873,212 cycles # 3.055 GHz + 16,901,716,244 instructions # 2.35 insn per cycle + 2.352401841 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1359) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.035295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.215553e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.215553e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.343298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.343298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.433519 sec - 7,141,153,744 cycles # 2.929 GHz - 13,619,130,373 instructions # 1.91 insn per cycle - 2.438958453 seconds time elapsed +TOTAL : 2.377491 sec + 7,114,519,285 cycles # 2.987 GHz + 13,619,081,744 instructions # 1.91 insn per cycle + 2.382536600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) 
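
Note on the counters: each timing block pairs raw hardware counts (cycles, instructions) with the derived rates printed after the # signs. The derivation is plain arithmetic; a sketch using the numbers from the avx2 block above:

  #include <cstdio>
  int main() {
    const double cycles = 7'114'519'285.0; // cycles counted over the run
    const double insns = 13'619'081'744.0; // instructions retired
    const double elapsed = 2.382536600;    // seconds time elapsed
    std::printf( "%.3f GHz\n", cycles / elapsed / 1e9 );    // ~2.99 GHz
    std::printf( "%.2f insn per cycle\n", insns / cycles ); // ~1.91
    return 0;
  }
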
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.071324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.326333e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.326333e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.131276e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.434861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.434861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.408738 sec - 7,063,825,257 cycles # 2.927 GHz - 13,435,596,499 instructions # 1.90 insn per cycle - 2.414135887 seconds time elapsed +TOTAL : 2.360462 sec + 7,057,553,337 cycles # 2.985 GHz + 13,435,682,624 instructions # 1.90 insn per cycle + 2.365710938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.750195e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.390595e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.390595e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.814153e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.521058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.521058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.646969 sec - 6,340,373,316 cycles # 2.391 GHz - 13,154,077,274 instructions # 2.07 insn per cycle - 2.652485679 seconds time elapsed +TOTAL : 2.589000 sec + 6,345,330,255 cycles # 2.447 GHz + 13,153,121,215 instructions # 2.07 insn per cycle + 2.594408710 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index a176ffc4e4..8dc3126453 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:17:19 +DATE: 2023-11-09_17:38:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.486918e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.802792e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.976330e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.618205e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.831793e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977288e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.656484 sec - 2,625,682,009 cycles # 2.960 GHz - 4,099,364,380 instructions # 1.56 insn per cycle - 0.946865269 seconds time elapsed +TOTAL : 0.652821 sec + 2,648,283,165 cycles # 3.003 GHz + 4,101,874,172 instructions # 1.55 insn per cycle + 0.942277172 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091320e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.274850e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274850e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110302e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.167326 sec - 18,738,979,619 cycles # 3.037 GHz - 44,287,346,211 instructions # 2.36 insn per cycle - 6.172563885 seconds time elapsed +TOTAL : 6.061696 sec + 18,702,058,697 cycles # 3.083 GHz + 44,286,744,373 instructions # 2.37 insn per cycle + 6.066885580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.716365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.273883e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.273883e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.748205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315149e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315149e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.065766 sec - 12,369,623,289 cycles # 3.039 GHz - 30,960,892,415 instructions # 2.50 insn per cycle - 4.071137873 seconds time elapsed +TOTAL : 3.994898 sec + 12,345,141,895 cycles # 3.087 GHz + 30,960,600,041 instructions # 2.51 insn per cycle + 4.000031168 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.040246e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.832671e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.832671e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.024705e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.805066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.805066e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.479287 sec - 10,114,657,367 cycles # 2.903 GHz - 19,400,067,612 instructions # 1.92 insn per cycle - 3.484811762 seconds time elapsed +TOTAL : 3.505414 sec + 10,100,327,501 cycles # 2.878 GHz + 19,399,870,617 instructions # 1.92 insn per cycle + 3.510718654 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.136561e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.021650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.021650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.175175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.066367e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066367e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.335937 sec - 9,745,210,637 cycles # 2.917 GHz - 18,969,865,366 instructions # 1.95 insn per cycle - 3.341324685 seconds time elapsed +TOTAL : 3.279954 sec + 9,681,673,426 cycles # 2.948 GHz + 18,969,865,921 instructions # 1.96 insn per cycle + 3.285422855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1859) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.846714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.476604e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.476604e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.629123e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.810646 sec - 8,364,453,052 cycles # 2.192 GHz - 15,065,277,596 instructions # 1.80 insn per cycle - 3.816111336 seconds time elapsed +TOTAL : 3.620894 sec + 8,364,739,572 cycles # 2.308 GHz + 15,064,814,645 instructions # 1.80 insn per cycle + 3.626218437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 155) (512z: 1316) 
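
Note on the m_* logs ("FP precision = MIXED"): they combine the two floating-point widths, running part of the computation in float while keeping double where rounding accumulates. A generic sketch of that pattern, not necessarily the project's exact fptype/fptype2 split:

  #include <vector>
  // Per-event weights computed in float, mean accumulated in double
  double meanME( const std::vector<float>& me ) {
    double sum = 0.0; // double accumulator limits round-off over many events
    for( const float w : me ) sum += w;
    return me.empty() ? 0.0 : sum / static_cast<double>( me.size() );
  }
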
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 257a2b14eb..a2d87f5da8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:17:52 +DATE: 2023-11-09_17:39:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.517340e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835074e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.047913e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632265e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.861047e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.036900e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.657459 sec - 2,634,612,924 cycles # 2.971 GHz - 4,038,430,114 instructions # 1.53 insn per cycle - 0.947276631 seconds time elapsed +TOTAL : 0.649714 sec + 2,641,937,888 cycles # 3.008 GHz + 4,107,555,428 instructions # 1.55 insn per cycle + 0.938941535 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.135032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.337803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.158637e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.370951e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.370951e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.948882 sec - 17,974,083,702 cycles # 3.020 GHz - 42,538,758,836 instructions # 2.37 insn per cycle - 5.954247483 seconds time elapsed +TOTAL : 5.823305 sec + 18,013,373,486 cycles # 3.091 GHz + 42,535,982,962 instructions # 2.36 insn per cycle + 5.828417378 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.746148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320939e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320939e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.770599e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353490e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.005000 sec - 12,179,888,264 cycles # 3.038 GHz - 30,267,022,025 instructions # 2.48 insn per cycle - 4.010444441 seconds time elapsed +TOTAL : 3.950045 sec + 12,171,205,402 cycles # 3.078 GHz + 30,268,628,414 instructions # 2.49 insn per cycle + 3.955313835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.877404e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.877404e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.099406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.925166e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.925166e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440250 sec - 10,026,177,275 cycles # 2.911 GHz - 19,281,771,933 instructions # 1.92 insn per cycle - 3.445652030 seconds time elapsed +TOTAL : 3.385614 sec + 10,033,748,773 cycles # 2.960 GHz + 19,281,534,051 instructions # 1.92 insn per cycle + 3.390768328 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2162) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.165158e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.064737e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.064737e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.135260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020042e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020042e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.297369 sec - 9,639,905,003 cycles # 2.920 GHz - 18,781,958,033 instructions # 1.95 insn per cycle - 3.302769757 seconds time elapsed +TOTAL : 3.343797 sec + 9,615,342,352 cycles # 2.872 GHz + 18,771,093,665 instructions # 1.95 insn per cycle + 3.349067283 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1833) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925761e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.602996e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.602996e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965653e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.664817 sec - 8,281,446,223 cycles # 2.257 GHz - 14,988,620,827 instructions # 1.81 insn per cycle - 3.670422107 seconds time elapsed +TOTAL : 3.592114 sec + 8,278,170,966 cycles # 2.302 GHz + 14,988,534,751 instructions # 1.81 insn per cycle + 3.597402233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1020) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 06ab23436d..dad81481e1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:18:25 +DATE: 2023-11-09_17:39:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.051243e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.169781e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269231e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113101e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178068e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274620e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.513968 sec - 2,206,571,631 cycles # 2.965 GHz - 3,147,975,302 instructions # 1.43 insn per cycle - 0.801145911 seconds time elapsed +TOTAL : 0.513513 sec + 2,238,779,994 cycles # 3.016 GHz + 3,236,054,047 instructions # 1.45 insn per cycle + 0.800586540 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.149781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.212668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199296e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.263095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.263095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.981998 sec - 15,156,593,836 cycles # 3.040 GHz - 38,437,072,823 instructions # 2.54 insn per cycle - 4.987299145 seconds time elapsed +TOTAL : 4.870986 sec + 15,138,095,755 cycles # 3.105 GHz + 38,436,824,615 instructions # 2.54 insn per cycle + 4.876178872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.640780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.985566 sec - 9,095,215,674 cycles # 3.042 GHz - 24,591,174,592 instructions # 2.70 insn per cycle - 2.991001875 seconds time elapsed +TOTAL : 2.960626 sec + 9,095,550,717 cycles # 3.068 GHz + 24,591,504,229 instructions # 2.70 insn per cycle + 2.966139239 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.834785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.339543e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.339543e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.803896e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.327557e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.327557e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.901490 sec - 5,454,837,265 cycles # 2.862 GHz - 11,265,546,477 instructions # 2.07 insn per cycle - 1.907039068 seconds time elapsed +TOTAL : 1.909794 sec + 5,486,817,505 cycles # 2.866 GHz + 11,265,648,347 instructions # 2.05 insn per cycle + 1.915029323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.372557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993390e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993390e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.555272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.195980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.195980e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.751887 sec - 4,963,717,675 cycles # 2.826 GHz - 10,572,023,161 instructions # 2.13 insn per cycle - 1.757527600 seconds time elapsed +TOTAL : 1.704245 sec + 4,927,847,485 cycles # 2.884 GHz + 10,572,013,859 instructions # 2.15 insn per cycle + 1.709455619 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.939400e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.168716e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.168716e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.103362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.341522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.341522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.769882 sec - 5,377,512,872 cycles # 1.939 GHz - 7,806,286,911 instructions # 1.45 insn per cycle - 2.775553290 seconds time elapsed +TOTAL : 2.658432 sec + 5,379,828,238 cycles # 2.021 GHz + 7,805,118,346 instructions # 1.45 insn per cycle + 2.663615123 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8de158cb65..d089f3ea80 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:51:49 +DATE: 2023-11-09_18:10:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.592700e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008872e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008872e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.436618e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.989585e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989585e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.804684 sec - 3,099,147,756 cycles # 2.967 GHz - 4,823,816,385 instructions # 1.56 insn per cycle - 1.102344703 seconds time elapsed +TOTAL : 0.820320 sec + 3,087,525,024 cycles # 2.881 GHz + 4,797,416,225 instructions # 1.55 insn per cycle + 1.129126082 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.051112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.111963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.111963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.137936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.202451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202451e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.297825 sec - 15,481,852,434 cycles # 2.919 GHz - 38,496,050,546 instructions # 2.49 insn per cycle - 5.304382607 seconds time elapsed +TOTAL : 5.088780 sec + 15,506,176,025 cycles # 3.045 GHz + 38,500,320,484 instructions # 2.48 insn per cycle + 5.095207532 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.421539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.610351e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.610351e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.664205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.863051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.863051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.252273 sec - 9,439,657,096 cycles # 2.897 GHz - 24,775,783,847 instructions # 2.62 insn per cycle - 3.259008663 seconds time elapsed +TOTAL : 3.042056 sec + 9,436,538,509 cycles # 3.096 GHz + 24,774,730,249 instructions # 2.63 insn per cycle + 3.048601444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.465972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.935898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.935898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.821161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.311886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.311886e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.107608 sec - 5,817,196,530 cycles # 2.752 GHz - 11,552,661,145 instructions # 1.99 insn per cycle - 2.114326410 seconds time elapsed +TOTAL : 1.984151 sec + 5,841,767,961 cycles # 2.936 GHz + 11,552,228,699 instructions # 1.98 insn per cycle + 1.990639911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.009635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.580924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.580924e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.505257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.122209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.122209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934696 sec - 5,303,416,333 cycles # 2.735 GHz - 10,861,487,391 instructions # 2.05 insn per cycle - 1.941424882 seconds time elapsed +TOTAL : 1.793114 sec + 5,293,839,115 cycles # 2.943 GHz + 10,856,913,242 instructions # 2.05 insn per cycle + 1.799607546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.701730e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912869e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912869e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.021313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.250852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.250852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.025583 sec - 5,727,782,590 cycles # 1.894 GHz - 8,052,158,492 instructions # 1.41 insn per cycle - 3.032424174 seconds time elapsed +TOTAL : 2.791071 sec + 5,762,529,693 cycles # 2.060 GHz + 8,048,857,986 instructions # 1.40 insn per cycle + 2.797719094 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index fc433be1ef..d4092f872a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:04:42 +DATE: 2023-11-09_18:22:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.726172e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159376e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270269e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.736311e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160845e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271332e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.626000 sec - 2,413,951,090 cycles # 2.822 GHz - 3,508,959,445 instructions # 1.45 insn per cycle - 0.913280230 seconds time elapsed +TOTAL : 0.616077 sec + 2,487,675,163 cycles # 2.949 GHz + 3,609,155,412 instructions # 1.45 insn per cycle + 0.900867999 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247369e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247369e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.176864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.240941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240941e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.967265 sec - 15,332,653,861 cycles # 3.084 GHz - 38,452,810,595 instructions # 2.51 insn per cycle - 4.972510854 seconds time elapsed +TOTAL : 4.979715 sec + 15,323,819,271 cycles # 3.075 GHz + 38,452,992,607 instructions # 2.51 insn per cycle + 4.984901972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.898409e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.898409e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.677729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.878488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878488e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.999548 sec - 9,281,583,975 cycles # 3.090 GHz - 24,591,762,393 instructions # 2.65 insn per cycle - 3.004985897 seconds time elapsed +TOTAL : 3.013873 sec + 9,290,869,776 cycles # 3.079 GHz + 24,592,367,735 instructions # 2.65 insn per cycle + 3.019043179 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.871319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.385365e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385365e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.850559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370312e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.950157 sec - 5,690,984,261 cycles # 2.911 GHz - 11,247,762,981 instructions # 1.98 insn per cycle - 1.955461495 seconds time elapsed +TOTAL : 1.954495 sec + 5,685,208,050 cycles # 2.902 GHz + 11,247,975,749 instructions # 1.98 insn per cycle + 1.959795584 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.503413e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.137413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.137413e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.607127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248201e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.776614 sec - 5,148,876,403 cycles # 2.891 GHz - 10,521,901,939 instructions # 2.04 insn per cycle - 1.781976606 seconds time elapsed +TOTAL : 1.748348 sec + 5,124,696,849 cycles # 2.923 GHz + 10,520,869,381 instructions # 2.05 insn per cycle + 1.753705732 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.075607e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.312212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.312212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.874391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.086224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.086224e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.736817 sec - 5,563,466,882 cycles # 2.030 GHz - 7,754,129,949 instructions # 1.39 insn per cycle - 2.742022793 seconds time elapsed +TOTAL : 2.872375 sec + 5,588,777,867 cycles # 1.950 GHz + 7,758,258,898 instructions # 1.39 insn per cycle + 2.877703247 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index f949e08a8e..b9b046957a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:01:25 +DATE: 2023-11-09_18:19:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.746837e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161251e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269946e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.737213e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157401e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270983e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.546588 sec - 2,339,106,527 cycles # 3.024 GHz - 3,639,530,742 instructions # 1.56 insn per cycle - 0.830477401 seconds time elapsed +TOTAL : 0.551870 sec + 2,343,082,954 cycles # 3.005 GHz + 3,662,705,915 instructions # 1.56 insn per cycle + 0.837059271 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.194821e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.259419e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.259419e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.189334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253549e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.881299 sec - 15,162,215,504 cycles # 3.104 GHz - 38,436,564,546 instructions # 2.54 insn per cycle - 4.886593937 seconds time elapsed +TOTAL : 4.893645 sec + 15,145,823,463 cycles # 3.092 GHz + 38,436,891,323 instructions # 2.54 insn per cycle + 4.899128465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.717533e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.921164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.921164e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.701689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.903671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.924290 sec - 9,098,563,572 cycles # 3.107 GHz - 24,592,229,111 instructions # 2.70 insn per cycle - 2.929612410 seconds time elapsed +TOTAL : 2.937292 sec + 9,090,406,845 cycles # 3.091 GHz + 24,590,949,325 instructions # 2.71 insn per cycle + 2.942627315 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.896966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.423160e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.423160e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.932093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.448459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.448459e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.883509 sec - 5,473,701,924 cycles # 2.899 GHz - 11,265,098,305 instructions # 2.06 insn per cycle - 1.888826353 seconds time elapsed +TOTAL : 1.870782 sec + 5,477,596,736 cycles # 2.921 GHz + 11,265,174,730 instructions # 2.06 insn per cycle + 1.876089705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.333944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.936194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.936194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.470328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.111006e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759678 sec - 4,959,739,230 cycles # 2.811 GHz - 10,570,009,461 instructions # 2.13 insn per cycle - 1.765083600 seconds time elapsed +TOTAL : 1.725707 sec + 4,951,306,612 cycles # 2.866 GHz + 10,571,555,034 instructions # 2.14 insn per cycle + 1.731137280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.108089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.344532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.344532e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.162238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.655128 sec - 5,388,561,520 cycles # 2.026 GHz - 7,804,959,196 instructions # 1.45 insn per cycle - 2.660471194 seconds time elapsed +TOTAL : 2.762921 sec + 5,392,276,499 cycles # 1.949 GHz + 7,806,030,768 instructions # 1.45 insn per cycle + 2.768347372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 6c72f6887e..655f8b81f2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:58:12 +DATE: 2023-11-09_18:16:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.993868e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158186e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266776e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.038584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158740e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268341e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.697399 sec - 2,787,983,765 cycles # 3.019 GHz - 4,369,945,413 instructions # 1.57 insn per cycle - 0.982292174 seconds time elapsed +TOTAL : 0.696596 sec + 2,778,143,738 cycles # 3.016 GHz + 4,350,451,856 instructions # 1.57 insn per cycle + 0.980250782 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.151791e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.213814e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.213814e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.245839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.245839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.977336 sec - 15,184,395,969 cycles # 3.048 GHz - 38,438,963,256 instructions # 2.53 insn per cycle - 4.982648512 seconds time elapsed +TOTAL : 4.909547 sec + 15,150,996,904 cycles # 3.083 GHz + 38,436,637,567 instructions # 2.54 insn per cycle + 4.914838193 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.705404e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.908313e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.908313e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.688279e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.888858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888858e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.933004 sec - 9,125,855,621 cycles # 3.107 GHz - 24,590,801,711 instructions # 2.69 insn per cycle - 2.938291037 seconds time elapsed +TOTAL : 2.947447 sec + 9,111,190,675 cycles # 3.087 GHz + 24,590,939,294 instructions # 2.70 insn per cycle + 2.952793630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.720849e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.210353e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.210353e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.931624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.461725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.461725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.938623 sec - 5,466,827,554 cycles # 2.814 GHz - 11,265,438,862 instructions # 2.06 insn per cycle - 1.943823759 seconds time elapsed +TOTAL : 1.871366 sec + 5,440,450,573 cycles # 2.900 GHz + 11,265,206,629 instructions # 2.07 insn per cycle + 1.876659163 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.635980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.287954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.287954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268733e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.682323 sec - 4,955,566,146 cycles # 2.937 GHz - 10,571,524,775 instructions # 2.13 insn per cycle - 1.687724736 seconds time elapsed +TOTAL : 1.687174 sec + 4,939,929,910 cycles # 2.920 GHz + 10,570,291,125 instructions # 2.14 insn per cycle + 1.692619999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.091835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.326386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.326386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.058474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.295667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.295667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.665788 sec - 5,400,449,096 cycles # 2.023 GHz - 7,805,014,579 instructions # 1.45 insn per cycle - 2.671129758 seconds time elapsed +TOTAL : 2.687752 sec + 5,409,737,421 cycles # 2.010 GHz + 7,805,529,138 instructions # 1.44 insn per cycle + 2.693129228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 3a0f520dcc..e703e9e5d5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:18:52 +DATE: 2023-11-09_17:40:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.048585e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168286e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265645e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110180e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174406e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270579e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515938 sec - 2,194,564,244 cycles # 2.948 GHz - 3,170,767,882 instructions # 1.44 insn per cycle - 0.803319972 seconds time elapsed +TOTAL : 0.513301 sec + 2,237,705,656 cycles # 3.016 GHz + 3,206,861,926 instructions # 1.43 insn per cycle + 0.799816578 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.145803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.208726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.208726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.991935 sec - 15,019,527,641 cycles # 3.006 GHz - 40,165,389,576 instructions # 2.67 insn per cycle - 4.997467241 seconds time elapsed +TOTAL : 4.840849 sec + 15,026,294,462 cycles # 3.101 GHz + 40,163,846,165 instructions # 2.67 insn per cycle + 4.846092672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.795270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.015877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.015877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.848578e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.068499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.068499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.867596 sec - 8,671,075,725 cycles # 3.019 GHz - 23,683,669,849 instructions # 2.73 insn per cycle - 2.873212548 seconds time elapsed +TOTAL : 2.827567 sec + 8,771,607,406 cycles # 3.097 GHz + 23,683,918,687 instructions # 2.70 insn per cycle + 2.832818835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2069) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.180539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.583447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.583447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.290749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696907e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.128793 sec - 6,072,650,571 cycles # 2.846 GHz - 13,074,915,373 instructions # 2.15 insn per cycle - 2.134316674 seconds time elapsed +TOTAL : 2.084491 sec + 6,075,216,707 cycles # 2.908 GHz + 13,074,699,153 instructions # 2.15 insn per cycle + 2.089762357 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.449593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.890564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.890564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.571274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.025491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.025491e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.028925 sec - 5,794,294,617 cycles # 2.851 GHz - 12,335,132,296 instructions # 2.13 insn per cycle - 2.034385767 seconds time elapsed +TOTAL : 1.983621 sec + 5,795,280,725 cycles # 2.915 GHz + 12,334,890,295 instructions # 2.13 insn per cycle + 1.988789955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.645486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.899846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.899846e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.982084 sec - 5,814,493,383 cycles # 1.947 GHz - 9,613,724,456 instructions # 1.65 insn per cycle - 2.987600867 seconds time elapsed +TOTAL : 2.932528 sec + 5,816,798,800 cycles # 1.981 GHz + 9,613,398,484 instructions # 1.65 insn per cycle + 2.938057800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1510) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 1cbf67a236..a5c5a0c704 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:41:33 +DATE: 2023-11-09_17:59:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.595048e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160670e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269203e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.735374e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165776e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275136e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521954 sec - 2,216,810,301 cycles # 2.935 GHz - 3,140,499,783 instructions # 1.42 insn per cycle - 0.812101303 seconds time elapsed +TOTAL : 0.522181 sec + 2,183,845,501 cycles # 2.897 GHz + 3,063,497,760 instructions # 1.40 insn per cycle + 0.813008083 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.505174e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.591402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.591402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573222e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.291146 sec - 13,017,199,090 cycles # 3.030 GHz - 34,406,598,887 instructions # 2.64 insn per cycle - 4.296733375 seconds time elapsed +TOTAL : 4.321242 sec + 13,015,032,492 cycles # 3.009 GHz + 34,406,787,342 instructions # 2.64 insn per cycle + 4.326519493 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.106755e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.121956e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266333e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.481603 sec - 10,608,834,284 cycles # 3.044 GHz - 24,023,421,035 instructions # 2.26 insn per cycle - 3.487384559 seconds time elapsed +TOTAL : 3.465165 sec + 10,606,115,107 cycles # 3.057 GHz + 24,023,886,202 instructions # 2.27 insn per cycle + 3.470527002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.756679e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.089717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.813993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.151107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.151107e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.309669 sec - 6,605,241,660 cycles # 2.854 GHz - 12,414,642,119 instructions # 1.88 insn per cycle - 2.315374830 seconds time elapsed +TOTAL : 2.282824 sec + 6,624,207,523 cycles # 2.896 GHz + 12,414,593,585 instructions # 1.87 insn per cycle + 2.288220203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.883072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.243446e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.243446e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.253913 sec - 6,256,146,881 cycles # 2.770 GHz - 11,588,754,266 instructions # 1.85 insn per cycle - 2.259602028 seconds time elapsed +TOTAL : 2.154567 sec + 6,244,302,737 cycles # 2.892 GHz + 11,586,784,905 instructions # 1.86 insn per cycle + 2.160119888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.014282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.246391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.246391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.080168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.315597e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.315597e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.718420 sec - 5,340,176,505 cycles # 1.961 GHz - 9,309,276,244 instructions # 1.74 insn per cycle - 2.724177871 seconds time elapsed +TOTAL : 2.674256 sec + 5,337,021,373 cycles # 1.992 GHz + 9,309,292,596 instructions # 1.74 insn per cycle + 2.679621915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 086ff92179..04c22c3970 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:42:00 +DATE: 2023-11-09_18:00:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.601958e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157408e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268312e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.730812e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271522e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523179 sec - 2,197,044,574 cycles # 2.904 GHz - 3,180,010,549 instructions # 1.45 insn per cycle - 0.813333970 seconds time elapsed +TOTAL : 0.517390 sec + 2,237,231,843 cycles # 2.985 GHz + 3,219,482,821 instructions # 1.44 insn per cycle + 0.806478536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.551621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.643503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.686328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.783817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.783817e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.216286 sec - 12,375,189,012 cycles # 2.932 GHz - 35,060,083,206 instructions # 2.83 insn per cycle - 4.222169031 seconds time elapsed +TOTAL : 4.006446 sec + 12,372,456,833 cycles # 3.085 GHz + 35,059,205,099 instructions # 2.83 insn per cycle + 4.011874603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.067813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209694e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209694e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.113185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.525507 sec - 10,698,056,208 cycles # 3.031 GHz - 23,100,081,560 instructions # 2.16 insn per cycle - 3.531306963 seconds time elapsed +TOTAL : 3.471336 sec + 10,684,507,667 cycles # 3.074 GHz + 23,099,965,959 instructions # 2.16 insn per cycle + 3.476724591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.118146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.507530e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.507530e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.172732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.154521 sec - 6,166,402,806 cycles # 2.856 GHz - 11,969,983,926 instructions # 1.94 insn per cycle - 2.160177772 seconds time elapsed +TOTAL : 2.130496 sec + 6,169,121,187 cycles # 2.891 GHz + 11,970,628,399 instructions # 1.94 insn per cycle + 2.136000238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.238236e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.649069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.649069e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.728928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.728928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.108281 sec - 6,026,300,401 cycles # 2.854 GHz - 11,141,738,024 instructions # 1.85 insn per cycle - 2.114031870 seconds time elapsed +TOTAL : 2.076859 sec + 6,006,071,025 cycles # 2.885 GHz + 11,143,550,799 instructions # 1.86 insn per cycle + 2.082481137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.978977e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.208595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.208595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.186490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.434908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.434908e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.742076 sec - 5,240,960,370 cycles # 1.908 GHz - 9,033,887,762 instructions # 1.72 insn per cycle - 2.747795404 seconds time elapsed +TOTAL : 2.608202 sec + 5,201,388,823 cycles # 1.991 GHz + 9,034,449,537 instructions # 1.74 insn per cycle + 2.613510222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eb4d5419ee..b055a915bb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:19:21 +DATE: 2023-11-09_17:40:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037656e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679710e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950060e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058988e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.701786e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976764e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.474624 sec - 2,093,800,407 cycles # 2.948 GHz - 2,971,543,250 instructions # 1.42 insn per cycle - 0.767958808 seconds time elapsed +TOTAL : 0.470897 sec + 2,078,401,117 cycles # 3.001 GHz + 2,953,650,991 instructions # 1.42 insn per cycle + 0.749721776 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.294584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.370694e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.370694e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.410542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.654140 sec - 14,153,083,054 cycles # 3.038 GHz - 38,392,852,878 instructions # 2.71 insn per cycle - 4.659227784 seconds time elapsed +TOTAL : 4.574332 sec + 14,151,959,917 cycles # 3.091 GHz + 38,392,913,322 instructions # 2.71 insn per cycle + 4.579307325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.142013e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.564188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.564188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.213719e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.123842 sec - 6,471,678,330 cycles # 3.041 GHz - 15,829,749,383 instructions # 2.45 insn per cycle - 2.129132115 seconds time elapsed +TOTAL : 2.094684 sec + 6,471,158,629 cycles # 3.083 GHz + 15,829,971,957 instructions # 2.45 insn per cycle + 2.099849038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.403745e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.082517e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082517e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.559598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.101002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101002e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.198427 sec - 3,459,269,129 cycles # 2.876 GHz - 7,606,844,485 instructions # 2.20 insn per cycle - 1.203597878 seconds time elapsed +TOTAL : 1.179944 sec + 3,466,899,201 cycles # 2.927 GHz + 7,607,183,710 instructions # 2.19 insn per cycle + 1.185084453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.005658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168806e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168806e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023293e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.190211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190211e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.126360 sec - 3,254,355,778 cycles # 2.878 GHz - 7,215,715,994 instructions # 2.22 insn per cycle - 1.131662200 seconds time elapsed +TOTAL : 1.106259 sec + 3,248,324,558 cycles # 2.924 GHz + 7,215,751,749 instructions # 2.22 insn per cycle + 1.111467205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.276060e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.142577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.142577e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.528725 sec - 3,068,447,705 cycles # 2.001 GHz - 5,846,027,778 instructions # 1.91 insn per cycle - 1.534029615 seconds time elapsed +TOTAL : 1.512902 sec + 3,068,145,100 cycles # 2.024 GHz + 5,846,808,445 instructions # 1.91 insn per cycle + 1.518114660 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 459315b5db..b4b4f0117a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:52:18 +DATE: 2023-11-09_18:10:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.229057e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759945e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759945e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.332495e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.768677e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.768677e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.663839 sec - 2,633,797,388 cycles # 2.963 GHz - 4,071,573,226 instructions # 1.55 insn per cycle - 0.947283739 seconds time elapsed +TOTAL : 0.657541 sec + 2,664,976,053 cycles # 3.017 GHz + 4,137,029,639 instructions # 1.55 insn per cycle + 0.940709573 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.280486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.353996e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.353996e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.284632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359311e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.724775 sec - 14,342,143,211 cycles # 3.033 GHz - 38,438,250,053 instructions # 2.68 insn per cycle - 4.731136861 seconds time elapsed +TOTAL : 4.717410 sec + 14,339,509,352 cycles # 3.036 GHz + 38,436,261,270 instructions # 2.68 insn per cycle + 4.723588153 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.072115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.484269e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.484269e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.161401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579571e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.579571e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.197377 sec - 6,673,460,854 cycles # 3.029 GHz - 16,110,044,412 instructions # 2.41 insn per cycle - 2.203637127 seconds time elapsed +TOTAL : 2.160392 sec + 6,674,034,151 cycles # 3.082 GHz + 16,110,239,223 instructions # 2.41 insn per cycle + 2.166483007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.156025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050843e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050843e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.368587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075649e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.276703 sec - 3,679,224,682 cycles # 2.872 GHz - 7,844,733,298 instructions # 2.13 insn per cycle - 1.282950304 seconds time elapsed +TOTAL : 1.245937 sec + 3,665,898,836 cycles # 2.929 GHz + 7,844,268,726 instructions # 2.14 insn per cycle + 1.252070096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.848037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.141843e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.141843e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169448e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.194194 sec - 3,452,479,238 cycles # 2.878 GHz - 7,452,050,539 instructions # 2.16 insn per cycle - 1.200346156 seconds time elapsed +TOTAL : 1.168797 sec + 3,453,510,139 cycles # 2.941 GHz + 7,453,168,499 instructions # 2.16 insn per cycle + 1.174935345 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.221197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.012402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.012402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.465484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.304262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.304262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.583142 sec - 3,273,382,507 cycles # 2.061 GHz - 6,100,795,667 instructions # 1.86 insn per cycle - 1.589319377 seconds time elapsed +TOTAL : 1.534111 sec + 3,274,248,388 cycles # 2.127 GHz + 6,100,577,921 instructions # 1.86 insn per cycle + 1.540213764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index dcdda81950..375a817a79 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:05:09 +DATE: 2023-11-09_18:23:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.826188e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.648877e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951378e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.824516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637814e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.946525e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.557947 sec - 2,332,705,336 cycles # 3.000 GHz - 3,420,801,676 instructions # 1.47 insn per cycle - 0.836912289 seconds time elapsed +TOTAL : 0.570368 sec + 2,261,358,530 cycles # 2.855 GHz + 3,305,358,456 instructions # 1.46 insn per cycle + 0.849017060 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.339471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.416548e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.416548e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.401234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401234e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.618720 sec - 14,313,897,069 cycles # 3.097 GHz - 38,421,663,028 instructions # 2.68 insn per cycle - 4.623775275 seconds time elapsed +TOTAL : 4.645582 sec + 14,325,809,375 cycles # 3.082 GHz + 38,422,987,894 instructions # 2.68 insn per cycle + 4.650648560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.232630e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.661001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.661001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.201320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.630090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.630090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.140530 sec - 6,636,885,571 cycles # 3.094 GHz - 15,842,171,589 instructions # 2.39 insn per cycle - 2.145594820 seconds time elapsed +TOTAL : 2.152860 sec + 6,643,060,083 cycles # 3.080 GHz + 15,842,584,477 instructions # 2.38 insn per cycle + 2.158023571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.545031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097804e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097804e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.450401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089441e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.233588 sec - 3,635,079,459 cycles # 2.936 GHz - 7,590,685,166 instructions # 2.09 insn per cycle - 1.238746125 seconds time elapsed +TOTAL : 1.246774 sec + 3,643,683,352 cycles # 2.913 GHz + 7,592,040,005 instructions # 2.08 insn per cycle + 1.251723719 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024875e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195413e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180349e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.160670 sec - 3,429,453,475 cycles # 2.944 GHz - 7,166,679,947 instructions # 2.09 insn per cycle - 1.165684786 seconds time elapsed +TOTAL : 1.172023 sec + 3,431,252,645 cycles # 2.917 GHz + 7,165,511,136 instructions # 2.09 insn per cycle + 1.177142051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.262300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.049639e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.049639e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.431571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.259454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259454e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.582365 sec - 3,235,924,413 cycles # 2.039 GHz - 5,796,611,749 instructions # 1.79 insn per cycle - 1.587507042 seconds time elapsed +TOTAL : 1.550630 sec + 3,238,644,111 cycles # 2.083 GHz + 5,796,702,494 instructions # 1.79 insn per cycle + 1.555869344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 831fd0fa9f..573aa8a1a6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:01:51 +DATE: 2023-11-09_18:20:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.837632e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.654775e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958238e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.875401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666103e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969743e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.503341 sec - 2,173,332,424 cycles # 3.019 GHz - 3,385,289,251 instructions # 1.56 insn per cycle - 0.779359289 seconds time elapsed +TOTAL : 0.505509 sec + 2,155,710,329 cycles # 2.977 GHz + 3,399,528,814 instructions # 1.58 insn per cycle + 0.781726612 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.329232e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.405368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.405368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.331654e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586570 sec - 14,159,897,717 cycles # 3.085 GHz - 38,395,355,740 instructions # 2.71 insn per cycle - 4.591702989 seconds time elapsed +TOTAL : 4.581584 sec + 14,155,354,915 cycles # 3.087 GHz + 38,394,211,404 instructions # 2.71 insn per cycle + 4.586893992 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.170239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.592491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.592491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.232934e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.666455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.666455e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.112173 sec - 6,472,075,786 cycles # 3.058 GHz - 15,829,638,315 instructions # 2.45 insn per cycle - 2.117221818 seconds time elapsed +TOTAL : 2.087317 sec + 6,475,857,503 cycles # 3.096 GHz + 15,829,568,301 instructions # 2.44 insn per cycle + 2.092497637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.605537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104706e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104706e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.589829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103388e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.174316 sec - 3,462,364,333 cycles # 2.937 GHz - 7,606,467,395 instructions # 2.20 insn per cycle - 1.179522425 seconds time elapsed +TOTAL : 1.175635 sec + 3,460,928,709 cycles # 2.933 GHz + 7,606,660,397 instructions # 2.20 insn per cycle + 1.180756657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024286e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.190805e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190805e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.939659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155071e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155071e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.105549 sec - 3,254,375,411 cycles # 2.932 GHz - 7,215,571,393 instructions # 2.22 insn per cycle - 1.110519445 seconds time elapsed +TOTAL : 1.139254 sec + 3,252,781,739 cycles # 2.845 GHz + 7,214,861,555 instructions # 2.22 insn per cycle + 1.144377149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.518662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.361331e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.361331e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.585994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.448568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.448568e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.478873 sec - 3,068,230,484 cycles # 2.069 GHz - 5,846,211,473 instructions # 1.91 insn per cycle - 1.484040601 seconds time elapsed +TOTAL : 1.467081 sec + 3,063,258,508 cycles # 2.082 GHz + 5,845,738,451 instructions # 1.91 insn per cycle + 1.472345808 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index bb838a2196..415792c712 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:58:39 +DATE: 2023-11-09_18:16:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.130902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643491e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.939128e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.158996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650796e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951969e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.604908 sec - 2,484,417,262 cycles # 3.021 GHz - 3,852,149,899 instructions # 1.55 insn per cycle - 0.881326202 seconds time elapsed +TOTAL : 0.604748 sec + 2,477,150,875 cycles # 3.008 GHz + 3,827,452,997 instructions # 1.55 insn per cycle + 0.882802717 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404078e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404078e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278227e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586292 sec - 14,210,336,618 cycles # 3.096 GHz - 38,392,847,533 instructions # 2.70 insn per cycle - 4.591549142 seconds time elapsed +TOTAL : 4.688006 sec + 14,149,964,703 cycles # 3.016 GHz + 38,393,052,805 instructions # 2.71 insn per cycle + 4.693060305 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.239674e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.668279e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.195001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.620625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.084661 sec - 6,470,762,281 cycles # 3.098 GHz - 15,829,570,536 instructions # 2.45 insn per cycle - 2.089664033 seconds time elapsed +TOTAL : 2.102860 sec + 6,473,914,859 cycles # 3.072 GHz + 15,829,595,595 instructions # 2.45 insn per cycle + 2.107994821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.589227e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.103396e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.103396e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.498213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092419e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.175545 sec - 3,466,544,418 cycles # 2.938 GHz - 7,606,584,140 instructions # 2.19 insn per cycle - 1.180575347 seconds time elapsed +TOTAL : 1.186261 sec + 3,464,671,010 cycles # 2.910 GHz + 7,606,636,115 instructions # 2.20 insn per cycle + 1.191422669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193480e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193480e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.018341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184021e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.105660 sec - 3,258,740,690 cycles # 2.936 GHz - 7,215,101,525 instructions # 2.21 insn per cycle - 1.110765672 seconds time elapsed +TOTAL : 1.113056 sec + 3,253,634,801 cycles # 2.912 GHz + 7,214,825,947 instructions # 2.22 insn per cycle + 1.118242022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.584208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.436586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.436586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.371211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.371211e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.465958 sec - 3,064,168,908 cycles # 2.084 GHz - 5,845,466,179 instructions # 1.91 insn per cycle - 1.471139277 seconds time elapsed +TOTAL : 1.477022 sec + 3,066,754,541 cycles # 2.070 GHz + 5,845,673,759 instructions # 1.91 insn per cycle + 1.482222084 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index d667b6dbf4..dbd0c88759 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:19:44 +DATE: 2023-11-09_17:41:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.049999e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.742417e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.025106e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.062894e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.751636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032491e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.475958 sec - 2,061,164,716 cycles # 2.907 GHz - 2,917,299,650 instructions # 1.42 insn per cycle - 0.766837667 seconds time elapsed +TOTAL : 0.471370 sec + 2,069,916,039 cycles # 2.986 GHz + 2,893,797,319 instructions # 1.40 insn per cycle + 0.749930288 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.217835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.287538e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.287538e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.241816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.314075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314075e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.813699 sec - 14,428,562,676 cycles # 2.998 GHz - 39,888,508,384 instructions # 2.76 insn per cycle - 4.818824247 seconds time elapsed +TOTAL : 4.763197 sec + 14,419,363,408 cycles # 3.025 GHz + 39,885,822,805 instructions # 2.77 insn per cycle + 4.768145939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.957468e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.536679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.536679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.077159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.666017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.666017e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.845039 sec - 5,590,599,138 cycles # 3.023 GHz - 15,299,534,426 instructions # 2.74 insn per cycle - 1.850198462 seconds time elapsed +TOTAL : 1.809175 sec + 5,591,744,554 cycles # 3.083 GHz + 15,300,029,522 instructions # 2.74 insn per cycle + 1.814409785 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.651061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.332537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.332537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.801496e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504366e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.660892 sec - 4,740,556,619 cycles # 2.846 GHz - 9,747,822,441 instructions # 2.06 insn per cycle - 1.666191221 seconds time elapsed +TOTAL : 1.624464 sec + 4,741,141,330 cycles # 2.911 GHz + 9,747,661,132 instructions # 2.06 insn per cycle + 1.629561959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3710) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.778515e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.494686e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.494686e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.005329e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.745480e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.745480e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.631450 sec
- 4,628,439,590 cycles # 2.829 GHz
- 9,339,816,116 instructions # 2.02 insn per cycle
- 1.636603727 seconds time elapsed
+TOTAL : 1.578447 sec
+ 4,623,271,493 cycles # 2.921 GHz
+ 9,339,033,786 instructions # 2.02 insn per cycle
+ 1.583594825 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.981004e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.517698e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.517698e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.210537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.774289e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.774289e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.837853 sec
- 3,663,588,168 cycles # 1.989 GHz
- 7,045,799,249 instructions # 1.92 insn per cycle
- 1.843187351 seconds time elapsed
+TOTAL : 1.770504 sec
+ 3,648,791,259 cycles # 2.056 GHz
+ 7,045,498,641 instructions # 1.93 insn per cycle
+ 1.775670307 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2606) (512y: 12) (512z: 2221)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index e94beeddac..c0790b6e36 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:42:28
+DATE: 2023-11-09_18:00:53
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.362873e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.640443e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.957691e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.858794e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.673199e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981057e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478743 sec
- 2,066,773,251 cycles # 2.940 GHz
- 2,882,191,672 instructions # 1.39 insn per cycle
- 0.760603829 seconds time elapsed
+TOTAL : 0.474263 sec
+ 2,125,063,536 cycles # 3.002 GHz
+ 3,025,852,918 instructions # 1.42 insn per cycle
+ 0.764897313 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.571240e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.665961e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.665961e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.589894e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.682971e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.682971e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.163303 sec
- 12,605,463,394 cycles # 3.025 GHz
- 34,393,608,512 instructions # 2.73 insn per cycle
- 4.168641817 seconds time elapsed
+TOTAL : 4.133418 sec
+ 12,609,458,975 cycles # 3.048 GHz
+ 34,395,001,210 instructions # 2.73 insn per cycle
+ 4.138439483 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.401759e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.886488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.886488e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.435122e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.914251e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.914251e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.027469 sec
- 6,100,742,722 cycles # 3.002 GHz
- 14,874,619,740 instructions # 2.44 insn per cycle
- 2.032997684 seconds time elapsed
+TOTAL : 2.014773 sec
+ 6,085,710,075 cycles # 3.014 GHz
+ 14,874,327,590 instructions # 2.44 insn per cycle
+ 2.020198945 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.152588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.984648e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.984648e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.550169e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.423492e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.423492e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.570348 sec
- 4,280,521,919 cycles # 2.743 GHz
- 9,042,316,644 instructions # 2.11 insn per cycle
- 1.575934676 seconds time elapsed
+TOTAL : 1.471458 sec
+ 4,290,277,982 cycles # 2.907 GHz
+ 9,041,954,393 instructions # 2.11 insn per cycle
+ 1.476543510 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.548985e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.445828e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.445828e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.705610e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.621776e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.621776e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.472831 sec
- 4,206,089,473 cycles # 2.847 GHz
- 8,677,889,358 instructions # 2.06 insn per cycle
- 1.478375348 seconds time elapsed
+TOTAL : 1.443048 sec
+ 4,208,694,980 cycles # 2.909 GHz
+ 8,677,287,895 instructions # 2.06 insn per cycle
+ 1.448442097 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.660562e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.137441e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.137441e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.842247e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.341676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.341676e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.938115 sec
- 3,846,715,012 cycles # 1.980 GHz
- 7,820,097,651 instructions # 2.03 insn per cycle
- 1.943482590 seconds time elapsed
+TOTAL : 1.878702 sec
+ 3,847,091,668 cycles # 2.044 GHz
+ 7,820,914,226 instructions # 2.03 insn per cycle
+ 1.883936977 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index a8a81cca05..a8fdecb532 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:42:52
+DATE: 2023-11-09_18:01:17
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.468219e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.688670e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018561e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.862525e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.715295e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.030318e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.479145 sec
- 2,060,928,745 cycles # 2.937 GHz
- 2,943,965,642 instructions # 1.43 insn per cycle
- 0.760902085 seconds time elapsed
+TOTAL : 0.474547 sec
+ 2,129,920,938 cycles # 3.015 GHz
+ 3,022,843,622 instructions # 1.42 insn per cycle
+ 0.763729431 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.752408e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.860428e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.860428e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.720126e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.824009e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.824009e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 3.895863 sec
- 11,764,358,308 cycles # 3.017 GHz
- 35,130,105,613 instructions # 2.99 insn per cycle
- 3.901121829 seconds time elapsed
+TOTAL : 3.938814 sec
+ 11,787,930,920 cycles # 2.995 GHz
+ 35,134,515,128 instructions # 2.98 insn per cycle
+ 3.943783291 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.491671e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.980976e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.980976e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.688740e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.207831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.207831e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.995272 sec
- 5,963,721,442 cycles # 2.982 GHz
- 14,483,479,258 instructions # 2.43 insn per cycle
- 2.000909308 seconds time elapsed
+TOTAL : 1.927645 sec
+ 5,955,477,747 cycles # 3.083 GHz
+ 14,483,875,890 instructions # 2.43 insn per cycle
+ 1.932605425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.606859e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.529662e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.529662e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.792092e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.717382e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.717382e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.463863 sec
- 4,171,268,875 cycles # 2.840 GHz
- 8,887,248,415 instructions # 2.13 insn per cycle
- 1.469508622 seconds time elapsed
+TOTAL : 1.428222 sec
+ 4,172,426,658 cycles # 2.912 GHz
+ 8,888,638,577 instructions # 2.13 insn per cycle
+ 1.433579963 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3576) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.334017e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.185528e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.185528e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.830183e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.777369e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.777369e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.515911 sec
- 4,141,896,373 cycles # 2.724 GHz
- 8,425,434,947 instructions # 2.03 insn per cycle
- 1.521361653 seconds time elapsed
+TOTAL : 1.421326 sec
+ 4,143,555,691 cycles # 2.906 GHz
+ 8,424,122,393 instructions # 2.03 insn per cycle
+ 1.426420575 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.735035e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.250427e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.250427e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.911357e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.422090e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.422090e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.913707 sec
- 3,815,274,575 cycles # 1.989 GHz
- 7,713,047,642 instructions # 2.02 insn per cycle
- 1.919181973 seconds time elapsed
+TOTAL : 1.856974 sec
+ 3,783,077,119 cycles # 2.033 GHz
+ 7,713,045,733 instructions # 2.04 insn per cycle
+ 1.862087187 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 1d637e1269..bc7d9de588 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:20:08
+DATE: 2023-11-09_17:41:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.064819e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.168761e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.265943e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.109904e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.171630e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.269382e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.516845 sec
- 2,194,660,841 cycles # 2.941 GHz
- 3,161,612,621 instructions # 1.44 insn per cycle
- 0.804942538 seconds time elapsed
+TOTAL : 0.511961 sec
+ 2,222,105,943 cycles # 3.001 GHz
+ 3,180,029,506 instructions # 1.43 insn per cycle
+ 0.797868325 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.076007e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.135159e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.135159e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.142732e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.204413e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.204413e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.157074 sec
- 15,456,785,340 cycles # 2.995 GHz
- 38,638,875,955 instructions # 2.50 insn per cycle
- 5.162652658 seconds time elapsed
+TOTAL : 4.999043 sec
+ 15,266,738,883 cycles # 3.052 GHz
+ 38,639,692,678 instructions # 2.53 insn per cycle
+ 5.004417103 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.689929e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.902707e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.902707e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.675686e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.874485e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.874485e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.947066 sec
- 8,960,192,906 cycles # 3.035 GHz
- 24,239,204,206 instructions # 2.71 insn per cycle
- 2.952599117 seconds time elapsed
+TOTAL : 2.956696 sec
+ 8,943,278,567 cycles # 3.020 GHz
+ 24,239,461,473 instructions # 2.71 insn per cycle
+ 2.961985342 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.870612e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.391820e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.391820e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.810568e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.309343e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.309343e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.891319 sec
- 5,424,929,342 cycles # 2.862 GHz
- 11,287,630,140 instructions # 2.08 insn per cycle
- 1.896741262 seconds time elapsed
+TOTAL : 1.907942 sec
+ 5,390,382,442 cycles # 2.818 GHz
+ 11,287,870,279 instructions # 2.09 insn per cycle
+ 1.913175131 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.626799e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.289896e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.289896e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.736389e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.412733e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.412733e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.686295 sec
- 4,842,859,663 cycles # 2.864 GHz
- 10,535,885,470 instructions # 2.18 insn per cycle
- 1.691658185 seconds time elapsed
+TOTAL : 1.660808 sec
+ 4,859,407,660 cycles # 2.918 GHz
+ 10,535,709,652 instructions # 2.17 insn per cycle
+ 1.666185530 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.120532e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.365927e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.365927e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.170238e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.418556e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.418556e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.650947 sec
- 5,210,620,634 cycles # 1.962 GHz
- 7,614,639,902 instructions # 1.46 insn per cycle
- 2.656437650 seconds time elapsed
+TOTAL : 2.618902 sec
+ 5,253,729,468 cycles # 2.003 GHz
+ 7,613,729,309 instructions # 1.45 insn per cycle
+ 2.624316082 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 92e3c9f0b5..008a5e172d 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:20:35
+DATE: 2023-11-09_17:41:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.066522e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.173508e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.273022e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.128890e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.181968e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279178e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.512769 sec
- 2,197,876,209 cycles # 2.961 GHz
- 3,170,940,757 instructions # 1.44 insn per cycle
- 0.799563998 seconds time elapsed
+TOTAL : 0.513214 sec
+ 2,219,973,022 cycles # 2.991 GHz
+ 3,202,428,118 instructions # 1.44 insn per cycle
+ 0.799522630 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.111886e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.172848e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.172848e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.124085e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.184530e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.184530e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.069953 sec
- 15,385,884,321 cycles # 3.032 GHz
- 40,433,272,287 instructions # 2.63 insn per cycle
- 5.075349465 seconds time elapsed
+TOTAL : 5.039921 sec
+ 15,384,037,518 cycles # 3.050 GHz
+ 40,433,132,851 instructions # 2.63 insn per cycle
+ 5.045085372 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.654822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.859127e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.859127e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.855191e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.079392e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.079392e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.975229 sec
- 8,506,893,399 cycles # 2.855 GHz
- 23,270,886,855 instructions # 2.74 insn per cycle
- 2.980696937 seconds time elapsed
+TOTAL : 2.823965 sec
+ 8,503,215,845 cycles # 3.006 GHz
+ 23,269,764,862 instructions # 2.74 insn per cycle
+ 2.829223148 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.053911e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.431363e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.431363e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.125017e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.510855e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.510855e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.179721 sec
- 6,241,572,834 cycles # 2.857 GHz
- 12,973,482,438 instructions # 2.08 insn per cycle
- 2.185137091 seconds time elapsed
+TOTAL : 2.149257 sec
+ 6,265,408,652 cycles # 2.910 GHz
+ 12,973,997,697 instructions # 2.07 insn per cycle
+ 2.154583439 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.331614e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.744905e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.744905e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.427179e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.860121e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.860121e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.072194 sec
- 5,929,542,555 cycles # 2.855 GHz
- 12,251,825,862 instructions # 2.07 insn per cycle
- 2.077717224 seconds time elapsed
+TOTAL : 2.035544 sec
+ 5,944,578,726 cycles # 2.915 GHz
+ 12,250,352,313 instructions # 2.06 insn per cycle
+ 2.040880399 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.800727e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.013912e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.013912e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.896609e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.113493e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.113493e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.863923 sec
- 5,611,513,288 cycles # 1.956 GHz
- 8,753,901,381 instructions # 1.56 insn per cycle
- 2.869313331 seconds time elapsed
+TOTAL : 2.794321 sec
+ 5,604,210,205 cycles # 2.003 GHz
+ 8,753,670,387 instructions # 1.56 insn per cycle
+ 2.799501421 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 87df63c965..a6a310dca7 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:21:03
+DATE: 2023-11-09_17:42:23
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.879738e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.041736e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.055795e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.987135e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.050792e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.063302e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.461849 sec
- 1,973,375,466 cycles # 2.915 GHz
- 2,850,187,396 instructions # 1.44 insn per cycle
- 0.733799311 seconds time elapsed
+TOTAL : 0.461215 sec
+ 2,013,982,440 cycles # 2.996 GHz
+ 2,888,271,641 instructions # 1.43 insn per cycle
+ 0.731639311 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.114902e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.320626e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.332328e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.121271e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.323663e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.335167e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.597626 sec
- 2,460,714,562 cycles # 2.956 GHz
- 3,716,258,767 instructions # 1.51 insn per cycle
- 0.892242937 seconds time elapsed
+TOTAL : 0.596567 sec
+ 2,489,603,363 cycles # 2.997 GHz
+ 3,769,346,991 instructions # 1.51 insn per cycle
+ 0.890614911 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.537254e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.549613e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.549613e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.576698e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.589005e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.589005e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.480284 sec
- 19,731,245,814 cycles # 3.044 GHz
- 59,610,628,892 instructions # 3.02 insn per cycle
- 6.484553626 seconds time elapsed
+TOTAL : 6.380855 sec
+ 19,728,048,826 cycles # 3.090 GHz
+ 59,610,032,345 instructions # 3.02 insn per cycle
+ 6.384875624 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.819525e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.864015e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.864015e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.837473e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.882254e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.882254e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.421528 sec
- 10,361,656,121 cycles # 3.025 GHz
- 30,678,833,436 instructions # 2.96 insn per cycle
- 3.425797412 seconds time elapsed
+TOTAL : 3.409518 sec
+ 10,359,121,121 cycles # 3.036 GHz
+ 30,679,203,213 instructions # 2.96 insn per cycle
+ 3.413745701 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.328413e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.498915e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.498915e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.786469e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.964416e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.964416e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.779184 sec
- 4,885,070,909 cycles # 2.740 GHz
- 11,021,940,228 instructions # 2.26 insn per cycle
- 1.783393950 seconds time elapsed
+TOTAL : 1.696222 sec
+ 4,887,496,480 cycles # 2.875 GHz
+ 11,021,602,656 instructions # 2.26 insn per cycle
+ 1.700511665 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.089421e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.111598e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111598e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.093744e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.115987e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.115987e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.526514 sec
- 4,365,565,996 cycles # 2.854 GHz
- 10,298,805,774 instructions # 2.36 insn per cycle
- 1.530732946 seconds time elapsed
+TOTAL : 1.520406 sec
+ 4,369,323,760 cycles # 2.867 GHz
+ 10,298,269,078 instructions # 2.36 insn per cycle
+ 1.524718704 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.324075e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.430754e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.430754e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.753883e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.865687e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.865687e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.262206 sec
- 4,104,673,936 cycles # 1.812 GHz
- 5,846,278,322 instructions # 1.42 insn per cycle
- 2.266456846 seconds time elapsed
+TOTAL : 2.137350 sec
+ 4,099,012,031 cycles # 1.915 GHz
+ 5,845,815,520 instructions # 1.43 insn per cycle
+ 2.141590310 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
index a8aafca020..47e341807c 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:52:42
+DATE: 2023-11-09_18:10:58
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.668584e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.838174e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.838174e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.707712e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.862456e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.862456e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.491390 sec
- 2,056,116,630 cycles # 2.930 GHz
- 3,087,605,373 instructions # 1.50 insn per cycle
- 0.760599439 seconds time elapsed
+TOTAL : 0.491938 sec
+ 2,095,418,329 cycles # 2.943 GHz
+ 3,181,792,573 instructions # 1.52 insn per cycle
+ 0.771165711 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.636054e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.636054e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.763222e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.617411e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.617411e+06 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.817784 sec
- 3,130,594,447 cycles # 2.944 GHz
- 4,997,770,241 instructions # 1.60 insn per cycle
- 1.126915791 seconds time elapsed
+TOTAL : 0.818666 sec
+ 3,177,291,822 cycles # 2.975 GHz
+ 5,098,451,441 instructions # 1.60 insn per cycle
+ 1.129356217 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.533314e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.546135e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.546135e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.524248e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.536588e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.536588e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.496533 sec
- 19,730,935,453 cycles # 3.036 GHz
- 59,615,663,798 instructions # 3.02 insn per cycle
- 6.500895427 seconds time elapsed
+TOTAL : 6.519126 sec
+ 19,771,628,211 cycles # 3.032 GHz
+ 59,619,366,283 instructions # 3.02 insn per cycle
+ 6.523440391 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.824473e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.869855e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.869855e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.881918e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.927973e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.927973e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.425054 sec
- 10,403,336,159 cycles # 3.035 GHz
- 30,728,089,368 instructions # 2.95 insn per cycle
- 3.429466512 seconds time elapsed
+TOTAL : 3.385706 sec
+ 10,402,667,023 cycles # 3.069 GHz
+ 30,728,506,666 instructions # 2.95 insn per cycle
+ 3.390173573 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.541398e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.724381e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.724381e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.797699e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.978652e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.978652e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.747981 sec
- 4,923,635,172 cycles # 2.811 GHz
- 11,072,838,099 instructions # 2.25 insn per cycle
- 1.752609449 seconds time elapsed
+TOTAL : 1.701840 sec
+ 4,920,530,137 cycles # 2.885 GHz
+ 11,072,335,054 instructions # 2.25 insn per cycle
+ 1.706256708 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.072827e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.095239e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.095239e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.099458e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.122078e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.122078e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.557290 sec
- 4,408,906,008 cycles # 2.824 GHz
- 10,349,337,234 instructions # 2.35 insn per cycle
- 1.561766662 seconds time elapsed
+TOTAL : 1.518361 sec
+ 4,398,354,549 cycles # 2.890 GHz
+ 10,347,368,561 instructions # 2.35 insn per cycle
+ 1.522642923 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.462789e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.573036e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.573036e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.773044e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.885749e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.885749e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.226828 sec
- 4,140,433,235 cycles # 1.856 GHz
- 5,883,947,133 instructions # 1.42 insn per cycle
- 2.231231918 seconds time elapsed
+TOTAL : 2.139176 sec
+ 4,134,059,026 cycles # 1.929 GHz
+ 5,885,050,529 instructions # 1.42 insn per cycle
+ 2.143583199 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
index 2485d7fbb8..de9a4f17b0 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:21:32
+DATE: 2023-11-09_17:42:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.914793e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.044227e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.057322e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.944811e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.043287e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.055886e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.462395 sec
- 2,001,608,406 cycles # 2.941 GHz
- 2,866,642,977 instructions # 1.43 insn per cycle
- 0.738112039 seconds time elapsed
+TOTAL : 0.460458 sec
+ 2,026,281,331 cycles # 3.005 GHz
+ 2,900,924,761 instructions # 1.43 insn per cycle
+ 0.731473724 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.109030e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.310930e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.322842e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115492e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.315818e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.327216e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.592309 sec
- 2,454,004,684 cycles # 2.967 GHz
- 3,701,468,710 instructions # 1.51 insn per cycle
- 0.885901852 seconds time elapsed
+TOTAL : 0.589562 sec
+ 2,467,189,653 cycles # 3.006 GHz
+ 3,742,728,616 instructions # 1.52 insn per cycle
+ 0.882301885 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.546247e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.558939e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.558939e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.562701e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.575539e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.575539e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.457597 sec
- 19,573,619,879 cycles # 3.030 GHz
- 58,802,481,580 instructions # 3.00 insn per cycle
- 6.461777687 seconds time elapsed
+TOTAL : 6.415806 sec
+ 19,556,589,093 cycles # 3.047 GHz
+ 58,802,097,142 instructions # 3.01 insn per cycle
+ 6.419943255 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.793642e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.840400e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.840400e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.964793e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.010479e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.010479e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.440445 sec
- 10,252,301,234 cycles # 2.977 GHz
- 30,351,085,669 instructions # 2.96 insn per cycle
- 3.444877379 seconds time elapsed
+TOTAL : 3.321576 sec
+ 10,234,879,480 cycles # 3.078 GHz
+ 30,349,718,565 instructions # 2.97 insn per cycle
+ 3.325925546 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.384802e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.551869e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.551869e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.508412e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.675254e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.675254e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.768254 sec
- 5,044,938,195 cycles # 2.848 GHz
- 11,486,596,301 instructions # 2.28 insn per cycle
- 1.772428896 seconds time elapsed
+TOTAL : 1.744975 sec
+ 5,046,123,954 cycles # 2.887 GHz
+ 11,486,788,981 instructions # 2.28 insn per cycle
+ 1.749151834 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.019018e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.038703e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.038703e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.033659e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.053692e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053692e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.630183 sec
- 4,647,706,592 cycles # 2.845 GHz
- 10,845,108,593 instructions # 2.33 insn per cycle
- 1.634411362 seconds time elapsed
+TOTAL : 1.606653 sec
+ 4,645,095,124 cycles # 2.885 GHz
+ 10,843,590,320 instructions # 2.33 insn per cycle
+ 1.610949978 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.188773e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.290125e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.290125e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.741864e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.853507e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.853507e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.304290 sec
- 4,123,403,300 cycles # 1.794 GHz
- 6,113,558,333 instructions # 1.48 insn per cycle
- 2.308644720 seconds time elapsed
+TOTAL : 2.140995 sec
+ 4,112,867,345 cycles # 1.919 GHz
+ 6,110,383,002 instructions # 1.49 insn per cycle
+ 2.145162136 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568)
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 0b448796b2..f7b3cf47d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:02 +DATE: 2023-11-09_17:43:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.567286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376211e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.468457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.559244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332615e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.416599e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.444374 sec - 1,959,403,583 cycles # 2.932 GHz - 2,755,627,615 instructions # 1.41 insn per cycle - 0.725331091 seconds time elapsed +TOTAL : 0.442703 sec + 1,956,548,432 cycles # 2.973 GHz + 2,743,818,395 instructions # 1.40 insn per cycle + 0.717328196 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.353667e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.408300e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.476909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.415878e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.488188e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.558288e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.490778 sec - 2,119,348,519 cycles # 2.946 GHz - 3,045,536,225 instructions # 1.44 insn per cycle - 0.776414109 seconds time elapsed +TOTAL : 0.487204 sec + 2,131,239,677 cycles # 3.000 GHz + 3,082,245,234 instructions # 1.45 insn per cycle + 0.768130616 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,825,625 cycles # 2.791 GHz - 48,514,379 instructions # 1.52 insn per cycle - 0.011782396 seconds time elapsed + 31,971,805 cycles # 2.811 GHz + 48,583,386 instructions # 1.52 insn per cycle + 0.011876482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 2f35cf010a..e1663755b4 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:53:11 +DATE: 2023-11-09_18:11:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.915722e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.200179e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.200179e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.114759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.213627e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213627e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.459965 sec - 1,913,489,356 cycles # 2.854 GHz - 2,835,494,218 instructions # 1.48 insn per cycle - 0.728586503 seconds time elapsed +TOTAL : 0.453382 sec + 1,979,110,250 cycles # 2.985 GHz + 2,941,718,851 instructions # 1.49 insn per cycle + 0.719982475 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.767536e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.641642e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.641642e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.789515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657512e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657512e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.634368 sec - 2,553,649,677 cycles # 2.951 GHz - 3,942,242,941 instructions # 1.54 insn per cycle - 0.922459199 seconds time elapsed +TOTAL : 0.632326 sec + 2,585,787,492 cycles # 3.000 GHz + 3,972,159,776 instructions # 1.54 insn per cycle + 0.920056111 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,9 +99,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 38,286,300 cycles # 2.778 GHz - 51,959,635 instructions # 1.36 insn per cycle - 0.014194921 seconds time elapsed + 38,570,643 cycles # 2.885 GHz + 52,119,941 instructions # 1.35 insn per cycle + 0.013856202 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index e630fbc27d..e8b37410be 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:11 +DATE: 2023-11-09_17:43:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.560442e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377270e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.470091e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.567326e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333824e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424930e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443339 sec - 1,943,957,931 cycles # 2.944 GHz - 2,765,105,739 instructions # 1.42 insn per cycle - 0.717258208 seconds time elapsed +TOTAL : 0.444883 sec + 1,998,454,742 cycles # 2.980 GHz + 2,813,430,207 instructions # 1.41 insn per cycle + 0.728667460 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.360432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412708e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.481720e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379215e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.422915e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.490315e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.491895 sec - 2,104,648,838 cycles # 2.938 GHz - 3,025,148,863 instructions # 1.44 insn per cycle - 0.773979442 seconds time elapsed +TOTAL : 0.488237 sec + 2,124,585,750 cycles # 2.987 GHz + 3,077,258,575 instructions # 1.45 insn per cycle + 0.769041859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,662,761 cycles # 2.798 GHz - 47,511,797 instructions # 1.50 insn per cycle - 0.011712916 seconds time elapsed + 31,375,066 cycles # 2.814 GHz + 47,697,134 instructions # 1.52 insn per cycle + 0.011523392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index e83376e827..aa3d979423 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:21 +DATE: 2023-11-09_17:43:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.888685e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043488e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056349e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.974532e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062592e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.461575 sec - 1,992,206,499 cycles # 2.947 GHz - 2,868,298,614 instructions # 1.44 insn per cycle - 0.733257197 seconds time elapsed +TOTAL : 0.466235 sec + 1,982,451,794 cycles # 2.881 GHz + 2,904,128,689 instructions # 1.46 insn per cycle + 0.746029193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111138e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315581e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.327177e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.118841e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320828e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332362e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.598628 sec - 2,465,744,279 cycles # 2.958 GHz - 3,812,193,472 instructions # 1.55 insn per cycle - 0.893336251 seconds time elapsed +TOTAL : 0.602851 sec + 2,418,002,144 cycles # 2.873 GHz + 3,684,858,061 instructions # 1.52 insn per cycle + 0.899181828 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,711,490 cycles # 2.787 GHz - 50,039,456 instructions # 1.44 insn per cycle - 0.012986618 seconds time elapsed + 34,749,440 cycles # 2.771 GHz + 50,090,467 instructions # 1.44 insn per cycle + 0.013126058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index ab62773e76..fa1b7c54dc 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:30 +DATE: 2023-11-09_17:43:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.840662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.037949e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050999e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.943854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040668e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053565e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462948 sec - 1,939,550,045 cycles # 2.866 GHz - 2,822,181,727 instructions # 1.46 insn per cycle - 0.733825753 seconds time elapsed +TOTAL : 0.466224 sec + 1,967,855,817 cycles # 2.845 GHz + 2,813,069,845 instructions # 1.43 insn per cycle + 0.750240924 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.102587e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303113e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305249e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316509e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.591515 sec - 2,444,078,815 cycles # 2.952 GHz - 3,674,116,474 instructions # 1.50 insn per cycle - 0.887442466 seconds time elapsed +TOTAL : 0.594738 sec + 2,473,036,884 cycles # 2.994 GHz + 3,768,499,475 instructions # 1.52 insn per cycle + 0.886746599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,181,769 cycles # 2.772 GHz - 49,201,973 instructions # 1.44 insn per cycle - 0.012846211 seconds time elapsed + 34,257,253 cycles # 2.793 GHz + 49,140,913 instructions # 1.43 insn per cycle + 0.012667194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 0e571e2957..5de2ca45d8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:22:40 +DATE: 2023-11-09_17:43:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.509565e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538049e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.522792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525024e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522429 sec - 2,216,464,510 cycles # 2.948 GHz - 3,445,335,287 instructions # 1.55 insn per cycle - 0.813178007 seconds time elapsed +TOTAL : 0.521868 sec + 2,246,075,293 cycles # 2.975 GHz + 3,415,991,617 instructions # 1.52 insn per cycle + 0.815510814 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.124490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.152981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.122388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.150135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151328e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.028693 sec - 9,700,865,704 cycles # 2.960 GHz - 20,299,179,534 instructions # 2.09 insn per cycle - 3.337900982 seconds time elapsed +TOTAL : 3.026853 sec + 9,913,864,058 cycles # 3.024 GHz + 22,195,735,281 instructions # 2.24 insn per cycle + 3.335346642 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948157e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949119e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949119e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927075e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927983e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.428390 sec - 25,658,286,461 cycles # 3.043 GHz - 78,943,496,553 instructions # 3.08 insn per cycle - 8.432674701 seconds time elapsed +TOTAL : 8.520375 sec + 25,675,362,415 cycles # 3.013 GHz + 78,943,710,554 instructions # 3.07 insn per cycle + 8.524455360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.638426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641828e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641828e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557363e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.560585e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560585e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.516511 sec - 12,940,511,466 cycles # 2.863 GHz - 39,286,083,355 instructions # 3.04 insn per cycle - 4.520821646 seconds time elapsed +TOTAL : 4.619361 sec + 12,935,854,234 cycles # 2.798 GHz + 39,286,025,399 instructions # 3.04 insn per cycle + 4.623706542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.063000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.079398e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.079398e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.091948e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.108522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.043453 sec - 5,578,804,578 cycles # 2.725 GHz - 13,689,979,347 instructions # 2.45 insn per cycle - 2.047766279 seconds time elapsed +TOTAL : 2.036281 sec + 5,584,766,890 cycles # 2.738 GHz + 13,690,141,249 instructions # 2.45 insn per cycle + 2.040702440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.584845e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.608001e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.608001e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.675809e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.698948e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.698948e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.720447 sec - 4,895,207,627 cycles # 2.839 GHz - 12,344,429,833 instructions # 2.52 insn per cycle - 1.724685286 seconds time elapsed +TOTAL : 1.704074 sec + 4,897,181,740 cycles # 2.868 GHz + 12,344,518,245 instructions # 2.52 insn per cycle + 1.708309061 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.405020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418567e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418567e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632146e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.645889e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.645889e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.224337 sec - 4,116,450,066 cycles # 1.848 GHz - 6,337,280,624 instructions # 1.54 insn per cycle - 2.228619766 seconds time elapsed +TOTAL : 2.158505 sec + 4,118,735,499 cycles # 1.905 GHz + 6,336,932,858 instructions # 1.54 insn per cycle + 2.162776211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 6cfffac867..322fb0150d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:53:56 +DATE: 2023-11-09_18:12:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.140206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.481973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481973e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.165662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477249e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.512248 sec - 2,184,996,199 cycles # 2.952 GHz - 3,435,282,796 instructions # 1.57 insn per cycle - 0.800472589 seconds time elapsed +TOTAL : 0.512387 sec + 2,201,868,575 cycles # 2.980 GHz + 3,430,381,187 instructions # 1.56 insn per cycle + 0.801238529 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.623195e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.099384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.642632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.111769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111769e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.306442 sec - 10,620,771,247 cycles # 2.970 GHz - 24,014,706,294 instructions # 2.26 insn per cycle - 3.633696672 seconds time elapsed +TOTAL : 3.299595 sec + 10,919,109,000 cycles # 3.053 GHz + 24,319,272,982 instructions # 2.23 insn per cycle + 3.633626468 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.935055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935984e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935984e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957325e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958258e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.489050 sec - 25,665,712,522 cycles # 3.023 GHz - 78,953,227,075 instructions # 3.08 insn per cycle - 8.493532453 seconds time elapsed +TOTAL : 8.392728 sec + 25,662,881,797 cycles # 3.059 GHz + 78,952,840,684 instructions # 3.08 insn per cycle + 8.396994023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.600578e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604115e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604115e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730470e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733980e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733980e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.569107 sec - 12,945,693,806 cycles # 2.831 GHz - 39,298,314,532 instructions # 3.04 insn per cycle - 4.573645709 seconds time elapsed +TOTAL : 4.409754 sec + 12,949,002,647 cycles # 2.934 GHz + 39,297,510,156 instructions # 3.03 insn per cycle + 4.414215325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.385455e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.402719e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.402719e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.533999e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.551780e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.551780e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.969364 sec - 5,591,964,229 cycles # 2.834 GHz - 13,700,332,532 instructions # 2.45 insn per cycle - 1.973976640 seconds time elapsed +TOTAL : 1.934795 sec + 5,595,375,698 cycles # 2.886 GHz + 13,699,668,832 instructions # 2.45 insn per cycle + 1.939106700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.515181e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.538996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.538996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.706839e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.728905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.728905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.736968 sec - 4,912,884,670 cycles # 2.825 GHz - 12,356,069,233 instructions # 2.52 insn per cycle - 1.741510676 seconds time elapsed +TOTAL : 1.702912 sec + 4,912,481,885 cycles # 2.879 GHz + 12,355,076,796 instructions # 2.52 insn per cycle + 1.707414472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.401693e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.415615e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.415615e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525002e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.540499e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.540499e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.229815 sec - 4,139,073,894 cycles # 1.853 GHz - 6,348,807,900 instructions # 1.53 insn per cycle - 2.234437952 seconds time elapsed +TOTAL : 2.193518 sec + 4,132,016,890 cycles # 1.881 GHz + 6,348,500,069 instructions # 1.54 insn per cycle + 2.198089448 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 829db14182..4e138ec032 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:05:32 +DATE: 2023-11-09_18:23:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.498087e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.526521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.485315e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.511617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513675e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.505963 sec - 2,230,602,584 cycles # 2.998 GHz - 3,509,146,743 instructions # 1.57 insn per cycle - 0.814638005 seconds time elapsed +TOTAL : 0.505341 sec + 2,219,350,607 cycles # 2.986 GHz + 3,460,374,619 instructions # 1.56 insn per cycle + 0.811034575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.138629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.170285e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.171692e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178152e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.117149 sec - 10,263,257,910 cycles # 3.044 GHz - 22,984,843,224 instructions # 2.24 insn per cycle - 3.428387488 seconds time elapsed +TOTAL : 3.133190 sec + 10,226,911,008 cycles # 3.021 GHz + 21,462,701,558 instructions # 2.10 insn per cycle + 3.444111151 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.955282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.956235e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.956235e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.962376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963339e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963339e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.398336 sec - 25,654,945,707 cycles # 3.057 GHz - 78,946,836,924 instructions # 3.08 insn per cycle - 8.402318295 seconds time elapsed +TOTAL : 8.368138 sec + 25,660,792,563 cycles # 3.066 GHz + 78,945,591,899 instructions # 3.08 insn per cycle + 8.372166508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742322e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742322e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725556e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729176e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.397110 sec - 12,932,706,473 cycles # 2.939 GHz - 39,284,078,298 instructions # 3.04 insn per cycle - 4.401176578 seconds time elapsed +TOTAL : 4.413013 sec + 12,940,530,582 cycles # 2.932 GHz + 39,286,713,275 instructions # 3.04 insn per cycle + 4.417069788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.547122e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.565515e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.565515e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.541485e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.558659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.558659e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.929747 sec - 5,584,587,761 cycles # 2.889 GHz - 13,688,784,163 instructions # 2.45 insn per cycle - 1.933938249 seconds time elapsed +TOTAL : 1.930984 sec + 5,584,027,716 cycles # 2.887 GHz + 13,688,917,418 instructions # 2.45 insn per cycle + 1.935195895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.712996e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.736353e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.736353e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.785385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.808420e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.808420e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.699524 sec - 4,899,825,358 cycles # 2.877 GHz - 12,342,496,756 instructions # 2.52 insn per cycle - 1.703963805 seconds time elapsed +TOTAL : 1.686810 sec + 4,897,782,017 cycles # 2.898 GHz + 12,342,341,736 instructions # 2.52 insn per cycle + 1.690859675 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.584277e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.599062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.599062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.578298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.591405e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.591405e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.173644 sec - 4,127,419,767 cycles # 1.897 GHz - 6,336,272,499 instructions # 1.54 insn per cycle - 2.177878840 seconds time elapsed +TOTAL : 2.175070 sec + 4,121,604,366 cycles # 1.892 GHz + 6,334,904,963 instructions # 1.54 insn per cycle + 2.179001381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 35703491ac..a5bd4bb577 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:02:14 +DATE: 2023-11-09_18:20:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.483209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.511610e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.495033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523414e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505331 sec - 2,237,004,452 cycles # 3.017 GHz - 3,469,560,739 instructions # 1.55 insn per cycle - 0.813831791 seconds time elapsed +TOTAL : 0.502389 sec + 2,234,960,295 cycles # 3.014 GHz + 3,501,182,478 instructions # 1.57 insn per cycle + 0.813908762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137446e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.169549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.170864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179832e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.063844 sec - 10,025,654,279 cycles # 3.024 GHz - 22,437,691,349 instructions # 2.24 insn per cycle - 3.371428026 seconds time elapsed +TOTAL : 3.070610 sec + 10,014,430,488 cycles # 3.015 GHz + 23,183,698,994 instructions # 2.32 insn per cycle + 3.378407946 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.972408e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.973332e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.973332e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.972782e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.973730e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973730e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.324018 sec - 25,644,049,472 cycles # 3.080 GHz - 78,945,889,994 instructions # 3.08 insn per cycle - 8.328093218 seconds time elapsed +TOTAL : 8.322572 sec + 25,630,767,892 cycles # 3.079 GHz + 78,944,418,555 instructions # 3.08 insn per cycle + 8.326671797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757960e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761409e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761409e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.718928e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.722195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.722195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.373690 sec - 12,932,578,462 cycles # 2.955 GHz - 39,286,223,538 instructions # 3.04 insn per cycle - 4.377750469 seconds time elapsed +TOTAL : 4.419054 sec + 12,933,087,616 cycles # 2.925 GHz + 39,284,437,808 instructions # 3.04 insn per cycle + 4.423270824 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.504027e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.521553e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.521553e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.554509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.572221e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.572221e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.937880 sec - 5,579,002,067 cycles # 2.875 GHz - 13,689,941,055 instructions # 2.45 insn per cycle - 1.941926119 seconds time elapsed +TOTAL : 1.926889 sec + 5,576,123,810 cycles # 2.889 GHz + 13,689,166,422 instructions # 2.45 insn per cycle + 1.931047296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.762551e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.785341e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.785341e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752389e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752389e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.689011 sec - 4,900,729,891 cycles # 2.896 GHz - 12,344,260,353 instructions # 2.52 insn per cycle - 1.693208802 seconds time elapsed +TOTAL : 1.694985 sec + 4,901,721,494 cycles # 2.886 GHz + 12,344,869,251 instructions # 2.52 insn per cycle + 1.699075447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.678242e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.692622e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.692622e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.451359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.144938 sec - 4,120,050,897 cycles # 1.918 GHz - 6,337,719,473 instructions # 1.54 insn per cycle - 2.149063218 seconds time elapsed +TOTAL : 2.210683 sec + 4,119,158,466 cycles # 1.861 GHz + 6,337,202,754 instructions # 1.54 insn per cycle + 2.214903970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index e3bb9b2d2b..e1894928b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:59:02 +DATE: 2023-11-09_18:17:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.202444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.496466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.498519e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.185134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499968e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.507729 sec - 2,224,053,547 cycles # 2.995 GHz - 3,511,447,697 instructions # 1.58 insn per cycle - 0.804264263 seconds time elapsed +TOTAL : 0.512712 sec + 2,117,085,337 cycles # 2.853 GHz + 3,348,083,687 instructions # 1.58 insn per cycle + 0.802067553 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.754243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.179050e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.746826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180194e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.195418 sec - 10,560,218,824 cycles # 3.053 GHz - 23,272,224,469 instructions # 2.20 insn per cycle - 3.516017944 seconds time elapsed +TOTAL : 3.195850 sec + 10,403,522,722 cycles # 3.010 GHz + 22,812,003,731 instructions # 2.19 insn per cycle + 3.513623861 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.980652e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.981660e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.981660e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978212e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979161e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979161e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.289291 sec - 25,689,530,854 cycles # 3.098 GHz - 78,941,485,494 instructions # 3.07 insn per cycle - 8.293329329 seconds time elapsed +TOTAL : 8.300221 sec + 25,643,059,514 cycles # 3.089 GHz + 78,945,101,648 instructions # 3.08 insn per cycle + 8.304495187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.699396e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.699396e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.720030e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723443e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723443e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.446640 sec - 12,939,707,143 cycles # 2.908 GHz - 39,286,790,527 instructions # 3.04 insn per cycle - 4.450934428 seconds time elapsed +TOTAL : 4.417696 sec + 12,936,090,694 cycles # 2.926 GHz + 39,285,549,332 instructions # 3.04 insn per cycle + 4.421886330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
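[Note on the EvtsPerSec[...] figures throughout these logs: they are throughputs, i.e. the number of Monte Carlo events processed in a given section divided by that section's wall-clock time. A minimal sketch of the metric (the helper name is hypothetical):

  #include <chrono>
  // Hypothetical helper illustrating the EvtsPerSec metric reported above:
  // throughput = number of events / wall-clock seconds of the timed section.
  template<typename Func>
  double evtsPerSec( int nevt, Func&& processEvents )
  {
    const auto t0 = std::chrono::steady_clock::now();
    processEvents( nevt ); // e.g. phase-space sampling plus matrix elements
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    return nevt / dt.count(); // in sec^-1, as printed in the logs
  }
]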
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.540564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.557570e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.557570e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.467679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484549e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.929715 sec - 5,584,326,574 cycles # 2.891 GHz - 13,690,307,414 instructions # 2.45 insn per cycle - 1.933841922 seconds time elapsed +TOTAL : 1.946291 sec + 5,575,526,782 cycles # 2.860 GHz + 13,689,232,963 instructions # 2.46 insn per cycle + 1.950526745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.772043e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.794673e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.794673e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.714029e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.737204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.737204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.687536 sec - 4,894,600,072 cycles # 2.895 GHz - 12,345,111,795 instructions # 2.52 insn per cycle - 1.691722733 seconds time elapsed +TOTAL : 1.697675 sec + 4,893,869,630 cycles # 2.877 GHz + 12,345,121,576 instructions # 2.52 insn per cycle + 1.701906664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.667022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.680748e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.680748e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.624620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.638794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.638794e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.148381 sec - 4,119,534,680 cycles # 1.915 GHz - 6,337,066,991 instructions # 1.54 insn per cycle - 2.152520896 seconds time elapsed +TOTAL : 2.160421 sec + 4,114,771,943 cycles # 1.902 GHz + 6,336,936,596 instructions # 1.54 insn per cycle + 2.164683207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 2d6466a5d0..d9a60f4c2d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:23:17 +DATE: 2023-11-09_17:44:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.472415e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.497562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.499582e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.474117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501625e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522603 sec - 2,199,879,357 cycles # 2.926 GHz - 3,406,329,945 instructions # 1.55 insn per cycle - 0.812598895 seconds time elapsed +TOTAL : 0.521490 sec + 2,250,098,032 cycles # 2.995 GHz + 3,547,618,625 instructions # 1.58 insn per cycle + 0.811334512 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.151978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.180898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182114e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144032e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.172097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173315e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.012462 sec - 9,824,681,781 cycles # 3.013 GHz - 20,251,773,236 instructions # 2.06 insn per cycle - 3.320916673 seconds time elapsed +TOTAL : 3.014234 sec + 9,779,736,194 cycles # 2.987 GHz + 19,303,224,180 instructions # 1.97 insn per cycle + 3.330161859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948786e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949722e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949722e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.971538e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972487e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972487e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.425425 sec - 25,600,858,897 cycles # 3.038 GHz - 78,714,675,174 instructions # 3.07 insn per cycle - 8.429623210 seconds time elapsed +TOTAL : 8.327938 sec + 25,611,620,219 cycles # 3.074 GHz + 78,715,429,796 instructions # 3.07 insn per cycle + 8.332111280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.648721e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.652034e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.652034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.709838e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713193e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.503525 sec - 12,897,071,716 cycles # 2.862 GHz - 39,231,170,693 instructions # 3.04 insn per cycle - 4.507786711 seconds time elapsed +TOTAL : 4.429736 sec + 12,908,947,595 cycles # 2.912 GHz + 39,230,824,629 instructions # 3.04 insn per cycle + 4.433832156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
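[Note on the "Internal loops fptype_sv = VECTOR[n]" lines: they state how many floating point values each SIMD build packs into one vector (in the double-precision logs, 2 for 'sse4', 4 for 'avx2'/'512y', 8 for '512z'). A minimal sketch of such vector types using GCC/Clang vector extensions, the general technique behind these builds (type names illustrative, not the exact cudacpp typedefs):

  // Illustrative only: SIMD vectors of doubles via compiler vector extensions.
  typedef double fptype;
  typedef fptype fptype_v2 __attribute__(( vector_size( 16 ) )); // 2 doubles ('sse4', 128bit)
  typedef fptype fptype_v4 __attribute__(( vector_size( 32 ) )); // 4 doubles ('avx2'/'512y', 256bit)
  typedef fptype fptype_v8 __attribute__(( vector_size( 64 ) )); // 8 doubles ('512z', 512bit)
  // One expression operates on all lanes at once:
  fptype_v4 axpy( fptype a, fptype_v4 x, fptype_v4 y ) { return a * x + y; }
]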
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.358235e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.375211e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.375211e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.184366e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.200734e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.200734e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.971459 sec - 5,607,121,481 cycles # 2.839 GHz - 13,803,544,350 instructions # 2.46 insn per cycle - 1.975775051 seconds time elapsed +TOTAL : 2.013363 sec + 5,615,451,412 cycles # 2.785 GHz + 13,804,151,174 instructions # 2.46 insn per cycle + 2.017493867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.338508e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.360185e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.360185e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.496512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.518383e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.518383e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.768893 sec - 4,962,697,559 cycles # 2.805 GHz - 12,469,802,045 instructions # 2.51 insn per cycle - 1.786199910 seconds time elapsed +TOTAL : 1.736002 sec + 4,961,501,370 cycles # 2.852 GHz + 12,469,539,646 instructions # 2.51 insn per cycle + 1.740286680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.426426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.440315e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.440315e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.549305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.563023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.563023e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.218010 sec - 4,123,694,980 cycles # 1.856 GHz - 6,461,412,200 instructions # 1.57 insn per cycle - 2.222394946 seconds time elapsed +TOTAL : 2.181875 sec + 4,116,495,870 cycles # 1.884 GHz + 6,461,064,172 instructions # 1.57 insn per cycle + 2.186117492 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index a4e352ee76..909bf4e735 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:43:16 +DATE: 2023-11-09_18:01:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.232524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.256814e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.259170e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265061e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.534744 sec - 2,248,485,126 cycles # 2.941 GHz - 3,494,101,027 instructions # 1.55 insn per cycle - 0.823969121 seconds time elapsed +TOTAL : 0.531588 sec + 2,281,405,083 cycles # 2.976 GHz + 3,558,676,633 instructions # 1.56 insn per cycle + 0.825879944 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.777627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.804807e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.805966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.802017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.803118e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.297104 sec - 10,673,501,263 cycles # 3.005 GHz - 24,226,094,920 instructions # 2.27 insn per cycle - 3.607615064 seconds time elapsed +TOTAL : 3.293832 sec + 10,794,008,612 cycles # 3.043 GHz + 23,569,569,961 instructions # 2.18 insn per cycle + 3.607202529 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.346513e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.346993e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.346993e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.420862e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.421336e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.421336e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.741985 sec - 113,582,106,901 cycles # 3.009 GHz - 144,968,769,114 instructions # 1.28 insn per cycle - 37.746219696 seconds time elapsed +TOTAL : 37.106861 sec + 113,630,776,289 cycles # 3.063 GHz + 144,980,863,935 instructions # 1.28 insn per cycle + 37.110990461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.143430e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145919e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145919e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.248348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248348e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.226537 sec - 14,726,949,716 cycles # 2.816 GHz - 37,578,521,140 instructions # 2.55 insn per cycle - 5.230978594 seconds time elapsed +TOTAL : 5.061979 sec + 14,717,920,983 cycles # 2.906 GHz + 37,577,837,464 instructions # 2.55 insn per cycle + 5.066177833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) 
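[Note on the cycles and instructions lines: these are perf-style hardware counters, and the derived figures in the logs follow directly from them. For the sse4 double inl1 run just above, for instance,

  \mathrm{IPC} = \frac{37{,}577{,}837{,}464\ \text{instructions}}{14{,}717{,}920{,}983\ \text{cycles}} \approx 2.55\ \text{insn per cycle}

matching the quoted value, and dividing the cycle count by the elapsed time similarly reproduces the quoted ~2.9 GHz.]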
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.619134e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.633428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.633428e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.791579e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806069e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806069e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.162328 sec - 6,132,958,052 cycles # 2.832 GHz - 13,063,746,182 instructions # 2.13 insn per cycle - 2.166766443 seconds time elapsed +TOTAL : 2.114146 sec + 6,120,754,225 cycles # 2.890 GHz + 13,063,521,271 instructions # 2.13 insn per cycle + 2.118343855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.242664e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.263271e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.263271e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.380050e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.401402e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.401402e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.783918 sec - 5,064,574,027 cycles # 2.835 GHz - 11,442,541,397 instructions # 2.26 insn per cycle - 1.788276031 seconds time elapsed +TOTAL : 1.757697 sec + 5,060,306,566 cycles # 2.873 GHz + 11,442,262,844 instructions # 2.26 insn per cycle + 1.761841609 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.693472e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.708550e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.708550e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.755291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.769173e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.769173e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.141610 sec - 3,984,341,945 cycles # 1.859 GHz - 5,944,587,769 instructions # 1.49 insn per cycle - 2.145941939 seconds time elapsed +TOTAL : 2.124539 sec + 3,983,245,523 cycles # 1.872 GHz + 5,944,184,553 instructions # 1.49 insn per cycle + 2.128814459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index c9a3c0bc00..8be167a2b3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:44:25 +DATE: 2023-11-09_18:02:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.238547e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.282651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285304e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528864 sec - 2,246,214,726 cycles # 2.961 GHz - 3,512,868,349 instructions # 1.56 insn per cycle - 0.816400547 seconds time elapsed +TOTAL : 0.525087 sec + 2,271,033,028 cycles # 3.019 GHz + 3,503,626,972 instructions # 1.54 insn per cycle + 0.810303022 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792504e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.819675e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.820783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.795218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.822430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823559e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.270254 sec - 10,633,900,320 cycles # 3.014 GHz - 24,514,837,826 instructions # 2.31 insn per cycle - 3.584387558 seconds time elapsed +TOTAL : 3.267432 sec + 10,775,752,309 cycles # 3.062 GHz + 23,804,895,620 instructions # 2.21 insn per cycle + 3.575584891 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.327617e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.328084e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.328084e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.382161e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.382658e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.382658e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.906136 sec - 114,405,747,001 cycles # 3.018 GHz - 145,562,165,740 instructions # 1.27 insn per cycle - 37.910396057 seconds time elapsed +TOTAL : 37.434460 sec + 114,573,902,263 cycles # 3.060 GHz + 145,559,795,063 instructions # 1.27 insn per cycle + 37.438717752 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.120905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.123383e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.123383e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172461e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.174968e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174968e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.264434 sec - 15,164,870,179 cycles # 2.879 GHz - 37,765,103,372 instructions # 2.49 insn per cycle - 5.268658441 seconds time elapsed +TOTAL : 5.178309 sec + 15,150,664,399 cycles # 2.924 GHz + 37,765,142,558 instructions # 2.49 insn per cycle + 5.182585019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) 
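[Note on the inl1 logs in this file: inlineHel=1 in the banner lines corresponds to builds where the helicity-amplitude functions are force-inlined. A minimal sketch of the kind of preprocessor toggle involved, assuming a switch of this shape (macro and function names are placeholders, not the exact cudacpp code):

  // Illustrative sketch of an "inline helicity amplitudes" build switch.
  #ifdef INLINE_HELAMPS
  #define HELINLINE inline __attribute__(( always_inline ))
  #else
  #define HELINLINE // no forced inlining: amplitudes stay out-of-line
  #endif
  HELINLINE double ffv1Amp( double w1, double w2, double coup ) // toy amplitude
  {
    return coup * w1 * w2;
  }
]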
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.815263e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.829969e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.829969e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.899691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.107998 sec - 6,006,546,140 cycles # 2.845 GHz - 12,898,448,008 instructions # 2.15 insn per cycle - 2.112261899 seconds time elapsed +TOTAL : 2.085123 sec + 6,007,372,451 cycles # 2.876 GHz + 12,897,891,125 instructions # 2.15 insn per cycle + 2.089322243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.170106e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.191645e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.191645e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.290925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.312116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.312116e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.798019 sec - 5,110,595,937 cycles # 2.837 GHz - 11,448,746,145 instructions # 2.24 insn per cycle - 1.802331588 seconds time elapsed +TOTAL : 1.774574 sec + 5,109,183,395 cycles # 2.874 GHz + 11,448,665,866 instructions # 2.24 insn per cycle + 1.778819443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.719086e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.733849e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.733849e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.900466e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915540e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915540e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.134583 sec - 3,969,461,110 cycles # 1.857 GHz - 5,897,831,571 instructions # 1.49 insn per cycle - 2.138816528 seconds time elapsed +TOTAL : 2.085227 sec + 3,957,731,000 cycles # 1.895 GHz + 5,897,967,734 instructions # 1.49 insn per cycle + 2.089481596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 9c1de01f16..24e6fadbe8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:23:53 +DATE: 2023-11-09_17:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.293342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.339166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.344348e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.337209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.383457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391632e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481289 sec - 2,043,429,418 cycles # 2.945 GHz - 3,016,391,404 instructions # 1.48 insn per cycle - 0.753087040 seconds time elapsed +TOTAL : 0.480161 sec + 2,056,195,749 cycles # 2.969 GHz + 3,041,501,171 instructions # 1.48 insn per cycle + 0.751973888 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.613713e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.676727e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.679629e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.613057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.675362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.678111e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.713007 sec - 5,846,211,987 cycles # 2.997 GHz - 12,059,135,892 instructions # 2.06 insn per cycle - 2.007812305 seconds time elapsed +TOTAL : 1.713246 sec + 5,908,983,228 cycles # 3.045 GHz + 11,684,311,184 instructions # 1.98 insn per cycle + 1.997404675 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.005115e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.006106e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.006106e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.054709e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.055772e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.055772e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.187511 sec - 24,627,671,323 cycles # 3.007 GHz - 78,134,663,224 instructions # 3.17 insn per cycle - 8.191568767 seconds time elapsed +TOTAL : 7.990756 sec + 24,645,365,645 cycles # 3.083 GHz + 78,136,702,059 instructions # 3.17 insn per cycle + 7.994878538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.313136e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.326827e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.326827e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.432830e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.446994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446994e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.250414 sec - 6,477,846,372 cycles # 2.874 GHz - 20,124,481,745 instructions # 3.11 insn per cycle - 2.254575609 seconds time elapsed +TOTAL : 2.213938 sec + 6,478,911,538 cycles # 2.922 GHz + 20,124,199,414 instructions # 3.11 insn per cycle + 2.218115274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.651750e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.658578e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.658578e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680617e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.687674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.687674e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.000733 sec - 2,836,203,846 cycles # 2.824 GHz - 6,991,580,060 instructions # 2.47 insn per cycle - 1.005051926 seconds time elapsed +TOTAL : 0.983889 sec + 2,838,821,051 cycles # 2.875 GHz + 6,991,598,423 instructions # 2.46 insn per cycle + 0.988065526 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.891596e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850029e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850029e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.874979 sec - 2,489,876,695 cycles # 2.834 GHz - 6,298,919,091 instructions # 2.53 insn per cycle - 0.879145628 seconds time elapsed +TOTAL : 0.898688 sec + 2,488,990,380 cycles # 2.759 GHz + 6,298,918,188 instructions # 2.53 insn per cycle + 0.902843603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.492404e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498044e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498044e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538961e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547910e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547910e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.107211 sec - 2,056,905,721 cycles # 1.852 GHz - 3,268,863,177 instructions # 1.59 insn per cycle - 1.111361855 seconds time elapsed +TOTAL : 1.073829 sec + 2,048,858,820 cycles # 1.904 GHz + 3,269,526,835 instructions # 1.60 insn per cycle + 1.078196054 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 7ef08eb1a1..741b2db05e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:54:33 +DATE: 2023-11-09_18:12:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.630785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.310772e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.310772e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.661835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.358766e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.466074 sec - 1,998,575,796 cycles # 2.933 GHz - 2,994,965,957 instructions # 1.50 insn per cycle - 0.738328183 seconds time elapsed +TOTAL : 0.465492 sec + 2,015,187,483 cycles # 2.973 GHz + 3,002,049,942 instructions # 1.49 insn per cycle + 0.734544576 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.261662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.481805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.481805e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.271779e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483162e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.889932 sec - 6,363,844,307 cycles # 2.984 GHz - 13,005,964,068 instructions # 2.04 insn per cycle - 2.191280597 seconds time elapsed +TOTAL : 1.878261 sec + 6,418,416,087 cycles # 3.037 GHz + 13,442,701,753 instructions # 2.09 insn per cycle + 2.169965161 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.002381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003373e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003373e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.022346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.023320e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.023320e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.200661 sec - 24,662,776,052 cycles # 3.006 GHz - 78,138,608,532 instructions # 3.17 insn per cycle - 8.204934256 seconds time elapsed +TOTAL : 8.120106 sec + 24,656,495,142 cycles # 3.035 GHz + 78,138,532,268 instructions # 3.17 insn per cycle + 8.124268827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.306848e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.320652e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.320652e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.385899e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400170e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400170e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.255006 sec - 6,482,848,456 cycles # 2.870 GHz - 20,133,573,977 instructions # 3.11 insn per cycle - 2.259320427 seconds time elapsed +TOTAL : 2.230708 sec + 6,485,115,953 cycles # 2.903 GHz + 20,133,634,822 instructions # 3.10 insn per cycle + 2.234788671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
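[Note on the "Set grid in Bridge" warnings in this bridge log: the grid fixes the number of events per GPU iteration directly from the CUDA launch configuration, consistently with the figures quoted above:

  n_\text{evt} = n_\text{gpublocks} \times n_\text{gputhreads} = 64 \times 256 = 16384, \qquad 2048 \times 256 = 524288
]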
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.648854e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655690e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666755e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.673825e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673825e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.005313 sec - 2,849,286,060 cycles # 2.824 GHz - 7,001,856,779 instructions # 2.46 insn per cycle - 1.009712120 seconds time elapsed +TOTAL : 0.994493 sec + 2,844,577,237 cycles # 2.850 GHz + 7,001,609,472 instructions # 2.46 insn per cycle + 0.998731395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.888498e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.898036e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.898036e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.867923e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.876610e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876610e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.879137 sec - 2,499,075,063 cycles # 2.831 GHz - 6,309,019,763 instructions # 2.52 insn per cycle - 0.883537991 seconds time elapsed +TOTAL : 0.888528 sec + 2,499,243,226 cycles # 2.802 GHz + 6,308,730,841 instructions # 2.52 insn per cycle + 0.892798888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.493195e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498802e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498802e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.109448 sec - 2,060,050,205 cycles # 1.851 GHz - 3,279,571,633 instructions # 1.59 insn per cycle - 1.113744599 seconds time elapsed +TOTAL : 1.107724 sec + 2,056,932,102 cycles # 1.850 GHz + 3,279,291,488 instructions # 1.59 insn per cycle + 1.112281401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 4d664fc4d6..341f303aae 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:06:09 +DATE: 2023-11-09_18:24:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.355118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.403966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.409182e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.311526e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.361390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.366448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.462753 sec - 2,014,223,981 cycles # 2.997 GHz - 3,038,538,632 instructions # 1.51 insn per cycle - 0.729935758 seconds time elapsed +TOTAL : 0.464707 sec + 2,008,078,996 cycles # 2.985 GHz + 3,036,723,964 instructions # 1.51 insn per cycle + 0.732085987 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.565713e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.634653e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.637720e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.547836e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.616999e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.620197e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.796992 sec
- 6,172,975,790 cycles # 3.046 GHz
- 13,083,554,495 instructions # 2.12 insn per cycle
- 2.086223865 seconds time elapsed
+TOTAL : 1.809045 sec
+ 6,020,726,960 cycles # 2.958 GHz
+ 11,569,273,710 instructions # 1.92 insn per cycle
+ 2.092173630 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.045430e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.046401e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.046401e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.048605e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.049604e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.049604e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.027601 sec
- 24,633,930,216 cycles # 3.068 GHz
- 78,134,736,063 instructions # 3.17 insn per cycle
- 8.031555788 seconds time elapsed
+TOTAL : 8.015102 sec
+ 24,651,277,493 cycles # 3.074 GHz
+ 78,133,763,667 instructions # 3.17 insn per cycle
+ 8.018994302 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.461058e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.474893e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.474893e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.377691e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.391250e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.391250e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4
-TOTAL : 2.206755 sec
- 6,481,821,994 cycles # 2.933 GHz
- 20,123,351,594 instructions # 3.10 insn per cycle
- 2.210721958 seconds time elapsed
+TOTAL : 2.232285 sec
+ 6,481,088,653 cycles # 2.899 GHz
+ 20,124,382,938 instructions # 3.11 insn per cycle
+ 2.236275849 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.665888e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.672800e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.672800e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.686029e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.693351e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.693351e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.994258 sec
- 2,841,630,041 cycles # 2.848 GHz
- 6,990,811,149 instructions # 2.46 insn per cycle
- 0.998209890 seconds time elapsed
+TOTAL : 0.981416 sec
+ 2,838,446,580 cycles # 2.882 GHz
+ 6,989,000,726 instructions # 2.46 insn per cycle
+ 0.985356553 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.891296e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.900721e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.900721e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.921238e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.930307e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.930307e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.876906 sec
- 2,495,700,726 cycles # 2.835 GHz
- 6,297,076,618 instructions # 2.52 insn per cycle
- 0.880948978 seconds time elapsed
+TOTAL : 0.863261 sec
+ 2,495,681,706 cycles # 2.880 GHz
+ 6,297,112,783 instructions # 2.52 insn per cycle
+ 0.867346097 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.552086e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.558027e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.558027e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.544822e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.550907e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.550907e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.070949 sec
- 2,049,379,894 cycles # 1.917 GHz
- 3,265,032,857 instructions # 1.59 insn per cycle
- 1.069477010 seconds time elapsed
+TOTAL : 1.070627 sec
+ 2,048,550,465 cycles # 1.908 GHz
+ 3,265,201,106 instructions # 1.59 insn per cycle
+ 1.074629445 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index ee315233c1..63178ad027 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_22:02:50
+DATE: 2023-11-09_18:21:06
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.328542e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.377951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.383103e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.362546e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.415600e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.420893e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.460444 sec
- 2,025,388,470 cycles # 3.016 GHz
- 3,026,490,924 instructions # 1.49 insn per cycle
- 0.728886791 seconds time elapsed
+TOTAL : 0.460752 sec
+ 2,005,673,830 cycles # 2.989 GHz
+ 2,996,841,960 instructions # 1.49 insn per cycle
+ 0.729795900 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.561347e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.630349e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.633332e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.567426e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.636917e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.639909e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.742876 sec
- 6,025,656,195 cycles # 3.063 GHz
- 13,153,972,386 instructions # 2.18 insn per cycle
- 2.023922647 seconds time elapsed
+TOTAL : 1.748848 sec
+ 5,960,134,043 cycles # 3.018 GHz
+ 12,821,096,532 instructions # 2.15 insn per cycle
+ 2.031326515 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.049881e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.050905e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.050905e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.057810e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.058811e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.058811e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.008804 sec
- 24,622,379,845 cycles # 3.073 GHz
- 78,134,077,156 instructions # 3.17 insn per cycle
- 8.012721206 seconds time elapsed
+TOTAL : 7.977700 sec
+ 24,629,048,089 cycles # 3.086 GHz
+ 78,132,914,520 instructions # 3.17 insn per cycle
+ 7.981637101 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.445321e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.458917e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.458917e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.439696e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.453635e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.453635e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.210782 sec
- 6,475,852,782 cycles # 2.925 GHz
- 20,124,175,553 instructions # 3.11 insn per cycle
- 2.214842110 seconds time elapsed
+TOTAL : 2.212183 sec
+ 6,477,339,632 cycles # 2.924 GHz
+ 20,124,428,604 instructions # 3.11 insn per cycle
+ 2.216339188 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.697514e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.704851e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.704851e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.594939e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.601395e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.601395e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.973836 sec
- 2,835,149,001 cycles # 2.901 GHz
- 6,991,410,852 instructions # 2.47 insn per cycle
- 0.977864307 seconds time elapsed
+TOTAL : 1.036317 sec
+ 2,842,114,214 cycles # 2.733 GHz
+ 6,991,999,004 instructions # 2.46 insn per cycle
+ 1.040742925 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.934817e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.944385e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.944385e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.922697e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.931896e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.931896e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.855140 sec
- 2,487,419,693 cycles # 2.897 GHz
- 6,298,706,089 instructions # 2.53 insn per cycle
- 0.859052723 seconds time elapsed
+TOTAL : 0.860619 sec
+ 2,490,053,798 cycles # 2.883 GHz
+ 6,298,956,842 instructions # 2.53 insn per cycle
+ 0.864591382 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.555511e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561377e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561377e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.532542e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.532542e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.062258 sec
- 2,048,558,209 cycles # 1.923 GHz
- 3,268,764,234 instructions # 1.60 insn per cycle
- 1.066272803 seconds time elapsed
+TOTAL : 1.082468 sec
+ 2,049,657,294 cycles # 1.888 GHz
+ 3,269,097,732 instructions # 1.59 insn per cycle
+ 1.086487061 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index efdbcfe1ae..2548057249 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:59:38
+DATE: 2023-11-09_18:17:53
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.758974e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.368878e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.373916e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.733376e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.369757e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.375069e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.462598 sec
- 2,002,083,704 cycles # 2.975 GHz
- 3,028,559,110 instructions # 1.51 insn per cycle
- 0.730010364 seconds time elapsed
+TOTAL : 0.461931 sec
+ 2,018,026,618 cycles # 3.001 GHz
+ 3,012,517,263 instructions # 1.49 insn per cycle
+ 0.729223266 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -71,14 +71,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.506677e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.634226e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.637242e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.494168e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.614081e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.617046e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.818599 sec
- 6,254,092,117 cycles # 3.058 GHz
- 12,631,559,563 instructions # 2.02 insn per cycle
- 2.110653596 seconds time elapsed
+TOTAL : 1.820293 sec
+ 6,248,771,087 cycles # 3.054 GHz
+ 13,452,131,003 instructions # 2.15 insn per cycle
+ 2.111868581 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.065897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.066912e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.066912e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.046417e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.047425e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.047425e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.946634 sec
- 24,618,185,681 cycles # 3.097 GHz
- 78,133,594,453 instructions # 3.17 insn per cycle
- 7.950536612 seconds time elapsed
+TOTAL : 8.022116 sec
+ 24,641,165,344 cycles # 3.070 GHz
+ 78,133,947,271 instructions # 3.17 insn per cycle
+ 8.026095295 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.469422e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.483642e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.483642e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.418320e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.431601e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.431601e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.203521 sec
- 6,477,304,059 cycles # 2.935 GHz
- 20,124,231,259 instructions # 3.11 insn per cycle
- 2.207560981 seconds time elapsed
+TOTAL : 2.218818 sec
+ 6,476,858,939 cycles # 2.915 GHz
+ 20,124,080,031 instructions # 3.11 insn per cycle
+ 2.222978465 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.692268e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.699231e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.699231e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.673631e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.680333e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.680333e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.976738 sec
- 2,836,504,426 cycles # 2.894 GHz
- 6,991,415,909 instructions # 2.46 insn per cycle
- 0.980720950 seconds time elapsed
+TOTAL : 0.987621 sec
+ 2,839,487,470 cycles # 2.865 GHz
+ 6,991,564,753 instructions # 2.46 insn per cycle
+ 0.991606693 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.804638e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.812886e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.812886e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.883610e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.892494e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.892494e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.917702 sec
- 2,493,684,467 cycles # 2.707 GHz
- 6,299,926,195 instructions # 2.53 insn per cycle
- 0.922017124 seconds time elapsed
+TOTAL : 0.878347 sec
+ 2,488,399,526 cycles # 2.822 GHz
+ 6,298,882,599 instructions # 2.53 insn per cycle
+ 0.882234875 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.542695e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.548647e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.548647e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.534627e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.540393e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.540393e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.070949 sec
- 2,049,167,689 cycles # 1.907 GHz
- 3,268,610,487 instructions # 1.60 insn per cycle
- 1.074921168 seconds time elapsed
+TOTAL : 1.076545 sec
+ 2,047,724,498 cycles # 1.897 GHz
+ 3,268,770,442 instructions # 1.60 insn per cycle
+ 1.080450235 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index afc8dc6250..3e46ada377 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:24:23
+DATE: 2023-11-09_17:45:41
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.334864e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.384627e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.390200e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.305671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.350688e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.358120e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.478485 sec
- 2,037,012,252 cycles # 2.938 GHz
- 3,030,553,414 instructions # 1.49 insn per cycle
- 0.751162438 seconds time elapsed
+TOTAL : 0.483459 sec
+ 2,029,909,968 cycles # 2.855 GHz
+ 2,962,980,745 instructions # 1.46 insn per cycle
+ 0.768081643 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.576633e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.638822e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.641657e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.574581e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.636147e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.638743e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.723660 sec
- 5,841,021,027 cycles # 2.992 GHz
- 11,140,232,262 instructions # 1.91 insn per cycle
- 2.010396879 seconds time elapsed
+TOTAL : 1.716632 sec
+ 5,921,965,881 cycles # 3.044 GHz
+ 11,852,981,757 instructions # 2.00 insn per cycle
+ 2.001901523 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.020250e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.021294e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.021294e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.062728e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.063773e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.063773e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.126388 sec
- 24,531,986,763 cycles # 3.018 GHz
- 77,860,700,825 instructions # 3.17 insn per cycle
- 8.130365170 seconds time elapsed
+TOTAL : 7.958572 sec
+ 24,559,190,224 cycles # 3.085 GHz
+ 77,859,989,303 instructions # 3.17 insn per cycle
+ 7.962642501 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3113) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.508420e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.523945e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.523945e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.583566e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.598037e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.598037e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.192196 sec
- 6,417,749,314 cycles # 2.923 GHz
- 20,089,444,717 instructions # 3.13 insn per cycle
- 2.196603069 seconds time elapsed
+TOTAL : 2.170856 sec
+ 6,426,627,449 cycles # 2.956 GHz
+ 20,090,039,565 instructions # 3.13 insn per cycle
+ 2.175014616 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.619246e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.625936e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.625936e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.591188e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.597484e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.597484e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.020667 sec
- 2,904,857,639 cycles # 2.836 GHz
- 7,133,491,112 instructions # 2.46 insn per cycle
- 1.024733034 seconds time elapsed
+TOTAL : 1.038604 sec
+ 2,902,688,212 cycles # 2.785 GHz
+ 7,133,529,057 instructions # 2.46 insn per cycle
+ 1.042821386 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.807219e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.815471e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.815471e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.840190e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.848739e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.848739e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.915311 sec
- 2,597,440,177 cycles # 2.827 GHz
- 6,442,073,160 instructions # 2.48 insn per cycle
- 0.919440444 seconds time elapsed
+TOTAL : 0.898885 sec
+ 2,595,883,470 cycles # 2.877 GHz
+ 6,441,979,586 instructions # 2.48 insn per cycle
+ 0.902832877 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.330502e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.335014e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.335014e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.492137e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.497778e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.497778e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.241025 sec
- 2,122,770,451 cycles # 1.706 GHz
- 3,430,866,539 instructions # 1.62 insn per cycle
- 1.245371552 seconds time elapsed
+TOTAL : 1.106744 sec
+ 2,123,250,955 cycles # 1.918 GHz
+ 3,431,574,417 instructions # 1.62 insn per cycle
+ 1.110853762 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2912) (512y: 22) (512z: 9647)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 86542f0b70..764181f824 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:45:34
+DATE: 2023-11-09_18:03:57
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.570490e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.610069e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.614296e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.601175e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.638676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.643535e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.491125 sec
- 2,098,886,797 cycles # 2.948 GHz
- 3,121,764,784 instructions # 1.49 insn per cycle
- 0.773983413 seconds time elapsed
+TOTAL : 0.484907 sec
+ 2,101,276,759 cycles # 2.981 GHz
+ 3,149,706,582 instructions # 1.50 insn per cycle
+ 0.766736785 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.716470e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.775515e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.778049e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.695736e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.752372e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.754868e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.856510 sec
- 6,241,842,396 cycles # 2.982 GHz
- 13,362,161,836 instructions # 2.14 insn per cycle
- 2.150637345 seconds time elapsed
+TOTAL : 1.853224 sec
+ 6,303,125,801 cycles # 3.016 GHz
+ 12,982,819,660 instructions # 2.06 insn per cycle
+ 2.146815240 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.736455e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.737287e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737287e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.841033e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.841866e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.841866e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.600113 sec
- 86,425,718,035 cycles # 3.022 GHz
- 135,574,556,258 instructions # 1.57 insn per cycle
- 28.604413837 seconds time elapsed
+TOTAL : 28.085663 sec
+ 86,167,672,431 cycles # 3.068 GHz
+ 135,565,357,772 instructions # 1.57 insn per cycle
+ 28.089696347 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15486) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.030289e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.043211e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.043211e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.152037e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.164422e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.164422e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.341197 sec
- 6,779,953,097 cycles # 2.892 GHz
- 19,387,529,866 instructions # 2.86 insn per cycle
- 2.345543121 seconds time elapsed
+TOTAL : 2.302124 sec
+ 6,785,316,910 cycles # 2.944 GHz
+ 19,388,398,647 instructions # 2.86 insn per cycle
+ 2.306338036 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.479111e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.484786e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.484786e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.500496e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.506041e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.506041e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.117197 sec
- 3,179,595,887 cycles # 2.837 GHz
- 6,808,760,792 instructions # 2.14 insn per cycle
- 1.121370768 seconds time elapsed
+TOTAL : 1.100781 sec
+ 3,177,227,261 cycles # 2.877 GHz
+ 6,808,813,623 instructions # 2.14 insn per cycle
+ 1.104867562 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.783416e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.791440e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.791440e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.797362e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.805452e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.805452e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.927417 sec
- 2,649,120,857 cycles # 2.846 GHz
- 5,987,099,017 instructions # 2.26 insn per cycle
- 0.931540821 seconds time elapsed
+TOTAL : 0.920545 sec
+ 2,652,149,170 cycles # 2.870 GHz
+ 5,986,924,086 instructions # 2.26 insn per cycle
+ 0.924698406 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.490502e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.495988e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.495988e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.476030e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.481355e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.481355e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.108557 sec
- 2,075,562,698 cycles # 1.867 GHz
- 3,501,563,321 instructions # 1.69 insn per cycle
- 1.112823809 seconds time elapsed
+TOTAL : 1.119541 sec
+ 2,077,679,044 cycles # 1.851 GHz
+ 3,501,921,791 instructions # 1.69 insn per cycle
+ 1.123804705 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 4737cdf8e3..7b7c373ccc 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:46:27
+DATE: 2023-11-09_18:04:48
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.528505e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.572699e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.577185e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.541471e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.579175e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.583358e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.485528 sec
- 2,086,161,680 cycles # 2.950 GHz
- 3,149,356,396 instructions # 1.51 insn per cycle
- 0.766853446 seconds time elapsed
+TOTAL : 0.484837 sec
+ 2,105,287,248 cycles # 2.990 GHz
+ 3,132,361,933 instructions # 1.49 insn per cycle
+ 0.765772342 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.640879e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.699452e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.702171e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.694480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.751016e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.753615e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.863196 sec
- 6,301,645,470 cycles # 3.002 GHz
- 12,163,417,933 instructions # 1.93 insn per cycle
- 2.157068829 seconds time elapsed
+TOTAL : 1.853183 sec
+ 6,341,276,975 cycles # 3.036 GHz
+ 13,434,801,047 instructions # 2.12 insn per cycle
+ 2.144878674 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.763152e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.763992e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.763992e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.834166e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.834994e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.834994e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.466782 sec
- 86,160,161,464 cycles # 3.027 GHz
- 135,907,402,983 instructions # 1.58 insn per cycle
- 28.470931551 seconds time elapsed
+TOTAL : 28.118822 sec
+ 86,081,697,198 cycles # 3.062 GHz
+ 135,906,074,576 instructions # 1.58 insn per cycle
+ 28.122852922 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.954712e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.967174e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.967174e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.132688e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.145964e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.145964e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.366132 sec
- 6,848,483,827 cycles # 2.890 GHz
- 19,440,750,063 instructions # 2.84 insn per cycle
- 2.370332980 seconds time elapsed
+TOTAL : 2.306989 sec
+ 6,845,463,882 cycles # 2.963 GHz
+ 19,440,308,006 instructions # 2.84 insn per cycle
+ 2.311118522 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.511072e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.516863e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.516863e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.544215e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.549994e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.549994e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.093285 sec
- 3,106,954,835 cycles # 2.833 GHz
- 6,720,019,206 instructions # 2.16 insn per cycle
- 1.097556495 seconds time elapsed
+TOTAL : 1.069611 sec
+ 3,120,065,313 cycles # 2.908 GHz
+ 6,719,636,670 instructions # 2.15 insn per cycle
+ 1.073683656 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.791720e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.799978e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.799978e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.829756e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.837937e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.837937e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.924560 sec
- 2,625,881,689 cycles # 2.831 GHz
- 5,970,468,600 instructions # 2.27 insn per cycle
- 0.928699193 seconds time elapsed
+TOTAL : 0.904097 sec
+ 2,625,695,846 cycles # 2.892 GHz
+ 5,970,269,399 instructions # 2.27 insn per cycle
+ 0.908318447 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.485772e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.491338e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.491338e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.517896e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.523661e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.523661e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.112143 sec
- 2,079,682,688 cycles # 1.864 GHz
- 3,494,926,799 instructions # 1.68 insn per cycle
- 1.116310984 seconds time elapsed
+TOTAL : 1.088404 sec
+ 2,079,379,564 cycles # 1.905 GHz
+ 3,494,888,851 instructions # 1.68 insn per cycle
+ 1.092417864 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 0d88057431..93a0b75f12 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:24:52
+DATE: 2023-11-09_17:46:11
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.461953e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.486921e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.488984e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.470867e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.494695e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.496793e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.524274 sec
- 2,213,988,684 cycles # 2.939 GHz
- 3,460,274,141 instructions # 1.56 insn per cycle
- 0.814878779 seconds time elapsed
+TOTAL : 0.522613 sec
+ 2,231,403,620 cycles # 2.972 GHz
+ 3,427,736,994 instructions # 1.54 insn per cycle
+ 0.812895260 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.131317e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.159899e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.161114e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.127962e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.155846e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.156998e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.024560 sec
- 9,783,019,983 cycles # 2.986 GHz
- 21,052,355,005 instructions # 2.15 insn per cycle
- 3.333798384 seconds time elapsed
+TOTAL : 3.024603 sec
+ 10,040,286,484 cycles # 3.065 GHz
+ 20,701,312,854 instructions # 2.06 insn per cycle
+ 3.332453984 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.908400e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.909295e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.909295e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.954833e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.955774e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.955774e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.603127 sec
- 25,922,951,314 cycles # 3.012 GHz
- 79,444,287,848 instructions # 3.06 insn per cycle
- 8.607377110 seconds time elapsed
+TOTAL : 8.399305 sec
+ 25,922,061,106 cycles # 3.085 GHz
+ 79,443,494,538 instructions # 3.06 insn per cycle
+ 8.403427486 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4857) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.601676e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.605199e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.605199e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.761504e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.765123e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.765123e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.563626 sec
- 12,670,494,381 cycles # 2.774 GHz
- 38,555,115,428 instructions # 3.04 insn per cycle
- 4.567958025 seconds time elapsed
+TOTAL : 4.369308 sec
+ 12,659,894,478 cycles # 2.895 GHz
+ 38,554,080,405 instructions # 3.05 insn per cycle
+ 4.373596593 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.436133e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.453065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.453065e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.648175e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.665781e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.665781e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.953575 sec - 5,515,640,809 cycles # 2.818 GHz - 13,484,131,277 instructions # 2.44 insn per cycle - 1.957940467 seconds time elapsed +TOTAL : 1.905268 sec + 5,516,001,376 cycles # 2.890 GHz + 13,483,921,346 instructions # 2.44 insn per cycle + 1.909531551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.530089e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.553433e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.553433e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.803935e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.827738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.827738e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730211 sec - 4,882,100,767 cycles # 2.816 GHz - 12,140,913,078 instructions # 2.49 insn per cycle - 1.734496344 seconds time elapsed +TOTAL : 1.682277 sec + 4,871,353,432 cycles # 2.890 GHz + 12,140,803,788 instructions # 2.49 insn per cycle + 1.686455915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.332978e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.346275e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.346275e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.374652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.387771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.387771e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.246181 sec - 4,144,338,295 cycles # 1.842 GHz - 6,339,235,304 instructions # 1.53 insn per cycle - 2.250536993 seconds time elapsed +TOTAL : 2.233464 sec + 4,145,054,475 cycles # 1.853 GHz + 6,339,255,297 instructions # 1.53 insn per cycle + 2.237809120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1802) (512y: 93) (512z: 9358) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 154c33870f..5c4ca592f3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:25:29 +DATE: 2023-11-09_17:46:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.466139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.493568e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.487617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514706e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523033 sec - 2,231,792,351 cycles # 2.947 GHz - 3,493,743,246 instructions # 1.57 insn per cycle - 0.817222718 seconds time elapsed +TOTAL : 0.518884 sec + 2,241,934,817 cycles # 2.999 GHz + 3,518,298,272 instructions # 1.57 insn per cycle + 0.808606683 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.131184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.159088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.160252e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.022684 sec - 9,525,982,822 cycles # 2.907 GHz - 21,759,904,749 instructions # 2.28 insn per cycle - 3.333718015 seconds time elapsed +TOTAL : 3.016798 sec + 10,040,228,896 cycles # 3.072 GHz + 22,037,859,926 instructions # 2.19 insn per cycle + 3.324922224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.890125e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.891036e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.891036e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.950722e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951656e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951656e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.687196 sec - 25,936,497,205 cycles # 2.985 GHz - 79,455,431,598 instructions # 3.06 insn per cycle - 8.691442955 seconds time elapsed +TOTAL : 8.416183 sec + 25,916,224,646 cycles # 3.078 GHz + 79,453,865,963 instructions # 3.07 insn per cycle + 8.420247127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4504) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.674580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678053e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678053e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.759672e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.763188e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.763188e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.473580 sec - 12,663,684,829 cycles # 2.829 GHz - 38,526,072,859 instructions # 3.04 insn per cycle - 4.477928329 seconds time elapsed +TOTAL : 4.371398 sec + 12,639,801,464 cycles # 2.889 GHz + 38,524,761,271 instructions # 3.05 insn per cycle + 4.375560053 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12928) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.447225e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.464376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.464376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.630529e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.648410e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.648410e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.950551 sec - 5,554,043,311 cycles # 2.842 GHz - 13,609,444,575 instructions # 2.45 insn per cycle - 1.954818500 seconds time elapsed +TOTAL : 1.909613 sec + 5,559,227,570 cycles # 2.906 GHz + 13,609,303,550 instructions # 2.45 insn per cycle + 1.913823155 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.528912e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.551046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.551046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.332740e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.353313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.353313e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730043 sec - 4,918,299,350 cycles # 2.837 GHz - 12,276,281,852 instructions # 2.50 insn per cycle - 1.734286887 seconds time elapsed +TOTAL : 1.766447 sec + 4,917,170,589 cycles # 2.778 GHz + 12,276,136,667 instructions # 2.50 insn per cycle + 1.770689432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.227160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.239598e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.239598e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.605174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618655e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.278650 sec - 4,148,690,065 cycles # 1.818 GHz - 6,446,007,726 instructions # 1.55 insn per cycle - 2.282996103 seconds time elapsed +TOTAL : 2.166306 sec + 4,144,641,386 cycles # 1.911 GHz + 6,445,298,096 instructions # 1.56 insn per cycle + 2.170508580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1627) (512y: 191) (512z: 9356) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index f7c4424904..b73b517066 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:27:51 +DATE: 2023-11-09_17:49:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070515e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070905e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.071008e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.070656e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.071067e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.071174e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.420963 sec - 8,223,258,722 cycles # 3.000 GHz - 17,670,197,130 instructions # 2.15 insn per cycle - 2.797812392 seconds time elapsed +TOTAL : 2.421343 sec + 8,332,807,450 cycles # 3.040 GHz + 16,939,230,243 instructions # 2.03 insn per cycle + 2.799270804 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.267469e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.269461e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.269740e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.271200e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273122e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273304e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.983357 sec - 12,890,762,548 cycles # 2.986 GHz - 28,149,713,448 instructions # 2.18 insn per cycle - 4.374511500 seconds time elapsed +TOTAL : 3.985063 sec + 13,247,174,015 cycles # 3.069 GHz + 30,019,215,878 instructions # 2.27 insn per cycle + 4.374841890 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.327736e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.327962e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.327962e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228283e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.228511e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.228511e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.345395 sec - 18,808,426,055 cycles # 2.963 GHz - 53,915,859,593 instructions # 2.87 insn per cycle - 6.349306785 seconds time elapsed +TOTAL : 6.424546 sec + 18,798,364,918 cycles # 2.925 GHz + 53,916,162,526 instructions # 2.87 insn per cycle + 6.428517349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.631387e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.631477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.631477e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.657858e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.657947e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657947e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.247242 sec - 9,798,431,936 cycles # 3.015 GHz - 27,093,078,884 instructions # 2.77 insn per cycle - 3.251306892 seconds time elapsed +TOTAL : 3.191098 sec + 9,844,225,763 cycles # 3.082 GHz + 27,092,778,504 instructions # 2.75 insn per cycle + 3.195159677 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.527269e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527671e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527671e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.638511e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.638939e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638939e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.502062 sec - 4,254,510,227 cycles # 2.826 GHz - 9,561,365,042 instructions # 2.25 insn per cycle - 1.506086006 seconds time elapsed +TOTAL : 1.457101 sec + 4,229,207,978 cycles # 2.896 GHz + 9,561,222,824 instructions # 2.26 insn per cycle + 1.461220413 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.044745e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.045315e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.045315e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.119963e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.120507e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120507e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.310362 sec - 3,714,842,589 cycles # 2.828 GHz - 8,485,417,237 instructions # 2.28 insn per cycle - 1.314439582 seconds time elapsed +TOTAL : 1.286739 sec + 3,714,427,423 cycles # 2.879 GHz + 8,485,272,385 instructions # 2.28 insn per cycle + 1.290826596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.650927e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.651448e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.651448e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600399e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600911e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600911e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.452786 sec - 2,695,403,304 cycles # 1.852 GHz - 4,273,125,151 instructions # 1.59 insn per cycle - 1.456779010 seconds time elapsed +TOTAL : 1.474924 sec + 2,695,875,361 cycles # 1.824 GHz + 4,273,169,567 instructions # 1.59 insn per cycle + 1.479057981 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index f73b319e4d..28081b2160 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:55:03 +DATE: 2023-11-09_18:13:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.071005e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.071005e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.064318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065254e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065254e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.374046 sec - 8,061,206,235 cycles # 2.993 GHz - 17,860,181,288 instructions # 2.22 insn per cycle - 2.750065172 seconds time elapsed +TOTAL : 2.361712 sec + 8,164,385,199 cycles # 3.041 GHz + 16,942,565,052 instructions # 2.08 insn per cycle + 2.743176660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.226901e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.259810e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.259810e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.190361e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223459e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.996223 sec - 12,903,615,719 cycles # 2.989 GHz - 27,064,646,353 instructions # 2.10 insn per cycle - 4.375939404 seconds time elapsed +TOTAL : 3.988233 sec + 13,123,079,634 cycles # 3.036 GHz + 28,841,455,416 instructions # 2.20 insn per cycle + 4.378479494 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.320809e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.321082e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.321082e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.307342e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.351015 sec - 18,895,432,596 cycles # 2.975 GHz - 53,920,363,469 instructions # 2.85 insn per cycle - 6.355030283 seconds time elapsed +TOTAL : 6.364568 sec + 18,927,213,544 cycles # 2.973 GHz + 53,918,164,087 instructions # 2.85 insn per cycle + 6.368577598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.632581e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632679e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.632679e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.666114e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.666114e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.239771 sec - 9,805,010,159 cycles # 3.023 GHz - 27,094,031,310 instructions # 2.76 insn per cycle - 3.243901475 seconds time elapsed +TOTAL : 3.173683 sec + 9,797,609,023 cycles # 3.084 GHz + 27,093,782,808 instructions # 2.77 insn per cycle + 3.177702749 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.542776e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543249e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543249e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.255906e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.256265e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.256265e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.496243 sec - 4,233,173,830 cycles # 2.823 GHz - 9,562,510,318 instructions # 2.26 insn per cycle - 1.500255263 seconds time elapsed +TOTAL : 1.626420 sec + 4,592,212,308 cycles # 2.818 GHz + 9,562,781,549 instructions # 2.08 insn per cycle + 1.630448393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.008828e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.009454e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.009454e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.133405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.134023e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134023e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.322886 sec - 3,744,251,192 cycles # 2.823 GHz - 8,486,441,130 instructions # 2.27 insn per cycle - 1.326937869 seconds time elapsed +TOTAL : 1.281959 sec + 3,704,600,058 cycles # 2.882 GHz + 8,486,385,133 instructions # 2.29 insn per cycle + 1.285885098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594601e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595186e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595186e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.663239e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.663889e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.663889e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.474018 sec - 2,696,155,761 cycles # 1.825 GHz - 4,274,155,931 instructions # 1.59 insn per cycle - 1.478064357 seconds time elapsed +TOTAL : 1.446962 sec + 2,696,700,654 cycles # 1.860 GHz + 4,274,559,971 instructions # 1.59 insn per cycle + 1.451147700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 7a2b2c0da9..4570a77a9f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:28:55 +DATE: 2023-11-09_17:50:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.069743e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070108e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070239e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.067332e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067722e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067853e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.423143 sec - 8,082,723,641 cycles # 2.933 GHz - 18,147,438,278 instructions # 2.25 insn per cycle - 2.812330272 seconds time elapsed +TOTAL : 2.421722 sec + 8,395,936,189 cycles # 3.053 GHz + 18,623,375,460 instructions # 2.22 insn per cycle + 2.807666336 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.271955e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.273887e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.274124e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.274592e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.276551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.276737e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.988843 sec - 13,001,327,656 cycles # 3.014 GHz - 27,551,753,777 instructions # 2.12 insn per cycle - 4.370037996 seconds time elapsed +TOTAL : 3.997048 sec + 13,290,560,155 cycles # 3.077 GHz + 29,230,575,077 instructions # 2.20 insn per cycle + 4.378333342 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.093188e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.093423e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.093423e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.641666e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641939e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641939e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.521355 sec - 18,798,207,330 cycles # 2.882 GHz - 53,926,908,182 instructions # 2.87 insn per cycle - 6.525452544 seconds time elapsed +TOTAL : 6.117499 sec + 18,785,945,280 cycles # 3.070 GHz + 53,927,524,861 instructions # 2.87 insn per cycle + 6.121375903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.629486e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.629575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.629575e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.649159e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.649256e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649256e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.245853 sec - 9,848,079,716 cycles # 3.031 GHz - 27,090,265,030 instructions # 2.75 insn per cycle - 3.250037477 seconds time elapsed +TOTAL : 3.206425 sec + 9,787,082,067 cycles # 3.050 GHz + 27,089,817,225 instructions # 2.77 insn per cycle + 3.210577008 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96284) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.490286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490752e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490752e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.558533e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558987e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558987e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.517008 sec - 4,257,648,545 cycles # 2.800 GHz - 9,561,344,255 instructions # 2.25 insn per cycle - 1.521285854 seconds time elapsed +TOTAL : 1.489258 sec + 4,261,284,391 cycles # 2.855 GHz + 9,561,306,757 instructions # 2.24 insn per cycle + 1.493274617 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.021344e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.021901e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.021901e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.116449e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116994e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.319133 sec - 3,701,318,743 cycles # 2.798 GHz - 8,485,189,781 instructions # 2.29 insn per cycle - 1.323286884 seconds time elapsed +TOTAL : 1.287600 sec + 3,697,517,464 cycles # 2.864 GHz + 8,485,532,294 instructions # 2.29 insn per cycle + 1.291548783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378941e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379448e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379448e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.666755e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.667279e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.667279e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.566088 sec - 2,698,066,709 cycles # 1.719 GHz - 4,276,879,461 instructions # 1.59 insn per cycle - 1.570153625 seconds time elapsed +TOTAL : 1.444368 sec + 2,694,896,725 cycles # 1.862 GHz + 4,276,159,790 instructions # 1.59 insn per cycle + 1.448419547 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index f4e838f103..4a0d02936a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:29:59 +DATE: 2023-11-09_17:51:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756376e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756775e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.745896e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.746749e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.746990e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659165 sec - 5,717,115,891 cycles # 2.955 GHz - 12,190,075,892 instructions # 2.13 insn per cycle - 1.991284959 seconds time elapsed +TOTAL : 1.657612 sec + 5,852,337,885 cycles # 3.029 GHz + 12,128,434,322 instructions # 2.07 insn per cycle + 1.989363075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.328819e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.329492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329584e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335676e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335767e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.928671 sec - 6,641,955,934 cycles # 3.003 GHz - 14,330,947,638 instructions # 2.16 insn per cycle - 2.270510678 seconds time elapsed +TOTAL : 1.921239 sec + 6,689,269,410 cycles # 3.045 GHz + 13,766,829,986 instructions # 2.06 insn per cycle + 2.253627777 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.903818e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.904090e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.904090e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.077848e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.078128e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.078128e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.935151 sec - 17,988,960,616 cycles # 3.029 GHz - 53,590,161,611 instructions # 2.98 insn per cycle - 5.939109392 seconds time elapsed +TOTAL : 5.821316 sec + 17,888,760,787 cycles # 3.072 GHz + 53,591,267,283 instructions # 3.00 insn per cycle + 5.825272234 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.520103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520628e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520628e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.576360e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576807e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576807e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.505890 sec - 4,563,568,647 cycles # 3.024 GHz - 13,762,453,321 instructions # 3.02 insn per cycle - 1.509910484 seconds time elapsed +TOTAL : 1.480982 sec + 4,560,162,627 cycles # 3.072 GHz + 13,762,313,674 instructions # 3.02 insn per cycle + 1.485020552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.038019e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.039763e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.039763e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.154943e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156669e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156669e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.756034 sec - 2,141,156,270 cycles # 2.820 GHz - 4,816,859,984 instructions # 2.25 insn per cycle - 0.760083736 seconds time elapsed +TOTAL : 0.743454 sec + 2,138,545,582 cycles # 2.865 GHz + 4,816,682,793 instructions # 2.25 insn per cycle + 0.747370846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.079503e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.081743e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.081743e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228374e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.230533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.230533e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658883 sec - 1,871,387,054 cycles # 2.825 GHz - 4,273,792,692 instructions # 2.28 insn per cycle - 0.663026186 seconds time elapsed +TOTAL : 0.646748 sec + 1,869,005,080 cycles # 2.875 GHz + 4,273,904,960 instructions # 2.29 insn per cycle + 0.650625419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.037980e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.040224e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.040224e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.373581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376135e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376135e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.756823 sec - 1,355,166,582 cycles # 1.782 GHz - 2,158,764,056 instructions # 1.59 insn per cycle - 0.760952708 seconds time elapsed +TOTAL : 0.721971 sec + 1,354,973,724 cycles # 1.868 GHz + 2,158,504,507 instructions # 1.59 insn per cycle + 0.726042839 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2878) (512y: 49) (512z:79298) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 6fa929f5b1..b3edd3819c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:56:06 +DATE: 2023-11-09_18:14:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.804869e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806749e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806749e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.797007e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.798750e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.798750e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.602285 sec - 5,612,741,884 cycles # 2.994 GHz - 11,823,721,041 instructions # 2.11 insn per cycle - 1.932057655 seconds time elapsed +TOTAL : 1.595659 sec + 5,717,240,119 cycles # 3.061 GHz + 12,288,497,969 instructions # 2.15 insn per cycle + 1.924944467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.321250e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.334433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.334433e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.290056e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.302765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302765e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.875073 sec - 6,423,111,015 cycles # 2.987 GHz - 14,218,262,182 instructions # 2.21 insn per cycle - 2.206850504 seconds time elapsed +TOTAL : 1.886551 sec + 6,639,132,324 cycles # 3.056 GHz + 14,322,788,387 instructions # 2.16 insn per cycle + 2.229781396 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.905231e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.905509e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.905509e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.171261e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.171565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.171565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.938687 sec - 17,836,764,476 cycles # 3.002 GHz - 53,590,153,759 instructions # 3.00 insn per cycle - 5.942639449 seconds time elapsed +TOTAL : 5.764943 sec + 17,824,241,728 cycles # 3.090 GHz + 53,589,840,001 instructions # 3.01 insn per cycle + 5.768783827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.489420e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489830e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.489830e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577193e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.577612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577612e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.517580 sec - 4,611,683,817 cycles # 3.032 GHz - 13,763,345,896 instructions # 2.98 insn per cycle - 1.521625428 seconds time elapsed +TOTAL : 1.481390 sec + 4,567,533,848 cycles # 3.077 GHz + 13,763,213,169 instructions # 3.01 insn per cycle + 1.485335177 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.247085e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248950e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248950e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.234763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236470e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236470e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.733666 sec - 2,134,815,435 cycles # 2.897 GHz - 4,817,815,542 instructions # 2.26 insn per cycle - 0.737580401 seconds time elapsed +TOTAL : 0.735214 sec + 2,134,795,694 cycles # 2.891 GHz + 4,817,744,368 instructions # 2.26 insn per cycle + 0.739133829 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.255023e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.257521e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.257521e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.254949e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257396e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257396e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.644323 sec - 1,868,915,722 cycles # 2.886 GHz - 4,274,871,857 instructions # 2.29 insn per cycle - 0.648325497 seconds time elapsed +TOTAL : 0.644560 sec + 1,871,614,525 cycles # 2.889 GHz + 4,274,807,727 instructions # 2.28 insn per cycle + 0.648424122 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.514603e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.516833e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.516833e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.456942e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.459224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.459224e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.708018 sec - 1,353,648,095 cycles # 1.903 GHz - 2,159,618,866 instructions # 1.60 insn per cycle - 0.711901071 seconds time elapsed +TOTAL : 0.714093 sec + 1,353,332,363 cycles # 1.886 GHz + 2,159,539,680 instructions # 1.60 insn per cycle + 0.718064585 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2878) (512y: 49) (512z:79298) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 2b69abf3e0..0346c64d8e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:30:46 +DATE: 2023-11-09_17:51:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.751553e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.752429e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.752778e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.750539e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.751383e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.751707e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.662264 sec - 5,791,514,994 cycles # 2.989 GHz - 11,290,505,064 instructions # 1.95 insn per cycle - 1.994487544 seconds time elapsed +TOTAL : 1.659496 sec + 5,776,495,417 cycles # 2.991 GHz + 11,901,437,818 instructions # 2.06 insn per cycle + 2.001980183 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.318654e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.319320e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319463e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353072e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353865e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.936834 sec - 6,513,518,428 cycles # 2.942 GHz - 13,310,876,477 instructions # 2.04 insn per cycle - 2.270995377 seconds time elapsed +TOTAL : 1.912117 sec + 6,490,117,914 cycles # 2.968 GHz + 14,058,143,997 instructions # 2.17 insn per cycle + 2.245070466 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.877357e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.877629e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.877629e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.137878e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.138152e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.138152e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.953085 sec - 17,926,444,710 cycles # 3.010 GHz - 53,580,674,845 instructions # 2.99 insn per cycle - 5.957045253 seconds time elapsed +TOTAL : 5.784705 sec + 17,870,079,189 cycles # 3.088 GHz + 53,579,576,519 instructions # 3.00 insn per cycle + 5.788683996 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.538806e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.539230e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.539230e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.609484e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.497134 sec - 4,549,359,025 cycles # 3.032 GHz - 13,755,898,061 instructions # 3.02 insn per cycle - 1.501295301 seconds time elapsed +TOTAL : 1.467915 sec + 4,547,996,475 cycles # 3.091 GHz + 13,755,684,665 instructions # 3.02 insn per cycle + 1.471804589 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.000854e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.002553e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.002553e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.135956e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.137601e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.137601e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.759453 sec - 2,151,217,111 cycles # 2.820 GHz - 4,818,966,673 instructions # 2.24 insn per cycle - 0.763529614 seconds time elapsed +TOTAL : 0.744886 sec + 2,148,725,562 cycles # 2.872 GHz + 4,818,942,438 instructions # 2.24 insn per cycle + 0.748866334 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.076028e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.078137e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.078137e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.165432e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.167702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.167702e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658855 sec - 1,875,464,841 cycles # 2.832 GHz - 4,275,819,002 instructions # 2.28 insn per cycle - 0.662852680 seconds time elapsed +TOTAL : 0.651347 sec + 1,877,062,772 cycles # 2.867 GHz + 4,276,072,949 instructions # 2.28 insn per cycle + 0.655395180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.283691e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.286276e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.286276e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338677e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.341123e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.341123e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.730286 sec - 1,357,956,935 cycles # 1.851 GHz - 2,164,994,730 instructions # 1.59 insn per cycle - 0.734341079 seconds time elapsed +TOTAL : 0.724588 sec + 1,360,263,123 cycles # 1.868 GHz + 2,164,996,305 instructions # 1.59 insn per cycle + 0.728742359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3475) (512y: 34) (512z:79492) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c2c8a96928..8c7934b526 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:31:33 +DATE: 2023-11-09_17:52:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.686778e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.687273e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.687409e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.693982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694475e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.694605e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171824 sec - 7,456,169,166 cycles # 2.995 GHz - 14,898,137,129 instructions # 2.00 insn per cycle - 2.549362993 seconds time elapsed +TOTAL : 2.169924 sec + 7,570,631,130 cycles # 3.042 GHz + 15,729,510,401 instructions # 2.08 insn per cycle + 2.547214982 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.112892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113171e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113203e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111941e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111967e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.401203 sec - 11,249,483,891 cycles # 3.009 GHz - 24,262,391,957 instructions # 2.16 insn per cycle - 3.794357278 seconds time elapsed +TOTAL : 3.399776 sec + 11,464,618,476 cycles # 3.079 GHz + 23,776,601,911 instructions # 2.07 insn per cycle + 3.779394913 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.772311e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.772526e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.772526e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.884667e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.884874e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884874e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.810863 sec - 19,135,542,784 cycles # 2.808 GHz - 54,153,577,866 instructions # 2.83 insn per cycle - 6.814854998 seconds time elapsed +TOTAL : 6.698742 sec + 19,113,024,695 cycles # 2.852 GHz + 54,153,033,540 instructions # 2.83 insn per cycle + 6.702658032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589475e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.589562e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.589562e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.621402e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621488e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621488e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327738 sec - 9,417,973,850 cycles # 2.827 GHz - 26,159,432,180 instructions # 2.78 insn per cycle - 3.331899471 seconds time elapsed +TOTAL : 3.261482 sec + 9,398,350,643 cycles # 2.879 GHz + 26,158,977,284 instructions # 2.78 insn per cycle + 3.265504352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96005) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.728829e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729288e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729288e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.791341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.420979 sec - 4,041,656,459 cycles # 2.838 GHz - 9,227,906,681 instructions # 2.28 insn per cycle - 1.425059392 seconds time elapsed +TOTAL : 1.398109 sec + 4,039,627,179 cycles # 2.883 GHz + 9,228,162,054 instructions # 2.28 insn per cycle + 1.402192827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.219686e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.220314e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.220314e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.351031e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.351641e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.351641e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.256653 sec - 3,545,597,499 cycles # 2.814 GHz - 8,175,250,543 instructions # 2.31 insn per cycle - 1.260805357 seconds time elapsed +TOTAL : 1.218443 sec + 3,518,124,342 cycles # 2.879 GHz + 8,175,077,517 instructions # 2.32 insn per cycle + 1.222409560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.660023e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.660558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.660558e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.765628e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766216e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.447622 sec - 2,657,673,224 cycles # 1.832 GHz - 4,154,915,823 instructions # 1.56 insn per cycle - 1.451764331 seconds time elapsed +TOTAL : 1.407689 sec + 2,655,252,329 cycles # 1.882 GHz + 4,154,811,941 instructions # 1.56 insn per cycle + 1.411617738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2045) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 485a0059f2..b26dd71707 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:32:35 +DATE: 2023-11-09_17:53:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.688491e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689012e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689176e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.674330e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674969e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.168620 sec - 7,451,542,055 cycles # 2.994 GHz - 15,551,253,703 instructions # 2.09 insn per cycle - 2.545633518 seconds time elapsed +TOTAL : 2.174009 sec + 7,611,935,314 cycles # 3.054 GHz + 16,836,441,609 instructions # 2.21 insn per cycle + 2.551658489 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107863e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108171e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107370e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107663e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.405108 sec - 11,192,783,578 cycles # 3.001 GHz - 25,734,796,379 instructions # 2.30 insn per cycle - 3.786804275 seconds time elapsed +TOTAL : 3.413929 sec + 11,386,114,072 cycles # 3.048 GHz + 23,902,448,329 instructions # 2.10 insn per cycle + 3.794282526 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.066104e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.066369e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.066369e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.931164e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.931386e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.931386e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.548059 sec - 19,079,779,477 cycles # 2.913 GHz - 54,153,651,610 instructions # 2.84 insn per cycle - 6.552064899 seconds time elapsed +TOTAL : 6.662052 sec + 19,079,234,145 cycles # 2.863 GHz + 54,153,851,240 instructions # 2.84 insn per cycle + 6.666006074 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589149e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.589238e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.589238e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.620269e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.620358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620358e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327579 sec - 9,382,040,636 cycles # 2.817 GHz - 26,078,619,591 instructions # 2.78 insn per cycle - 3.331633706 seconds time elapsed +TOTAL : 3.263602 sec + 9,383,434,712 cycles # 2.872 GHz + 26,078,178,648 instructions # 2.78 insn per cycle + 3.267785109 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95899) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662193e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.662639e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.662639e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.732412e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.732940e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732940e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.447113 sec - 4,073,138,574 cycles # 2.808 GHz - 9,213,586,675 instructions # 2.26 insn per cycle - 1.451209760 seconds time elapsed +TOTAL : 1.420295 sec + 4,071,120,210 cycles # 2.859 GHz + 9,213,520,884 instructions # 2.26 insn per cycle + 1.424453149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.194379e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.195039e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.195039e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.308670e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.309271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309271e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.264023 sec - 3,548,672,085 cycles # 2.800 GHz - 8,168,128,611 instructions # 2.30 insn per cycle - 1.268138683 seconds time elapsed +TOTAL : 1.231097 sec + 3,538,361,762 cycles # 2.867 GHz + 8,168,060,632 instructions # 2.31 insn per cycle + 1.234995598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.707082e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.707666e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.707666e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.830037e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.830636e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.830636e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.430570 sec - 2,620,935,291 cycles # 1.830 GHz - 4,154,056,327 instructions # 1.58 insn per cycle - 1.434770233 seconds time elapsed +TOTAL : 1.385915 sec + 2,618,303,188 cycles # 1.885 GHz + 4,153,502,106 instructions # 1.59 insn per cycle + 1.389952232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1492) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 45ec48d9b4..6d792821e6 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:06 +DATE: 2023-11-09_17:47:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.850720e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.319691e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.646421e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.838115e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336717e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.669956e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445414 sec - 1,963,940,666 cycles # 2.941 GHz - 2,761,951,187 instructions # 1.41 insn per cycle - 0.725454441 seconds time elapsed +TOTAL : 0.441585 sec + 1,966,591,447 cycles # 2.991 GHz + 2,767,621,879 instructions # 1.41 insn per cycle + 0.715631755 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.571453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.132541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.489040e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.614381e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.150528e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.499874e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.525989 sec - 2,266,225,819 cycles # 2.950 GHz - 3,255,459,976 instructions # 1.44 insn per cycle - 0.825689166 seconds time elapsed +TOTAL : 0.519980 sec + 2,272,719,542 cycles # 3.015 GHz + 3,282,015,462 instructions # 1.44 insn per cycle + 0.810626224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074272e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096702e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.097912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.120487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.547835 sec - 4,705,088,880 cycles # 3.034 GHz - 13,467,070,551 instructions # 2.86 insn per cycle - 1.551905661 seconds time elapsed +TOTAL : 1.514129 sec + 4,699,091,915 cycles # 3.096 GHz + 13,466,947,436 instructions # 2.87 insn per cycle + 1.518294228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.836387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.906822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.906822e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.983607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.058142e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.058142e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.914850 sec - 2,629,820,703 cycles # 2.863 GHz - 7,555,643,977 instructions # 2.87 insn per cycle - 0.919312372 seconds time elapsed +TOTAL : 0.847498 sec + 2,625,908,011 cycles # 3.086 GHz + 7,555,492,469 instructions # 2.88 insn per cycle + 0.851823974 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.179916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.388522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.388522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619511e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538121 sec - 1,483,909,982 cycles # 2.739 GHz - 3,122,112,991 instructions # 2.10 insn per cycle - 0.542506000 seconds time elapsed +TOTAL : 0.504120 sec + 1,476,957,330 cycles # 2.909 GHz + 3,122,047,526 instructions # 2.11 insn per cycle + 0.508259108 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.492769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.748148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.748148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.754841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.026481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.026481e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.492302 sec - 1,352,205,323 cycles # 2.727 GHz - 2,983,986,621 instructions # 2.21 insn per cycle - 0.496759795 seconds time elapsed +TOTAL : 0.457617 sec + 1,342,416,487 cycles # 2.911 GHz + 2,984,161,058 instructions # 2.22 insn per cycle + 0.461673437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.316160e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.426685e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.426685e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.547509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672958e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.732612 sec - 1,330,714,647 cycles # 1.807 GHz - 1,956,053,126 instructions # 1.47 insn per cycle - 0.737097876 seconds time elapsed +TOTAL : 0.666989 sec + 1,325,861,856 cycles # 1.977 GHz + 1,955,811,920 instructions # 1.48 insn per cycle + 0.671229633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 9573fdc8ac..8337df6649 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:53:21 +DATE: 2023-11-09_18:11:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.674751e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.241786e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.241786e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580013e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253753e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.472075 sec - 2,011,630,446 cycles # 2.946 GHz - 2,977,593,506 instructions # 1.48 insn per cycle - 0.740354864 seconds time elapsed +TOTAL : 0.470514 sec + 2,029,905,705 cycles # 2.983 GHz + 3,022,396,069 instructions # 1.49 insn per cycle + 0.739050820 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.306214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.374405e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.374405e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.291351e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372563e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372563e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.746007 sec - 2,930,819,015 cycles # 2.951 GHz - 4,513,689,699 instructions # 1.54 insn per cycle - 1.051041659 seconds time elapsed +TOTAL : 0.742777 sec + 2,970,951,255 cycles # 2.999 GHz + 4,514,637,368 instructions # 1.52 insn per cycle + 1.047584901 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.067462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.089978e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.563342 sec - 4,743,647,659 cycles # 3.027 GHz - 13,474,115,002 instructions # 2.84 insn per cycle - 1.567732700 seconds time elapsed +TOTAL : 1.538614 sec + 4,724,111,132 cycles # 3.063 GHz + 13,474,132,709 instructions # 2.85 insn per cycle + 1.542829058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.931899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004806e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004806e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.968452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.876421 sec - 2,657,928,129 cycles # 3.020 GHz - 7,605,320,089 instructions # 2.86 insn per cycle - 0.880831982 seconds time elapsed +TOTAL : 0.860148 sec + 2,657,657,312 cycles # 3.076 GHz + 7,605,024,054 instructions # 2.86 insn per cycle + 0.864557816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.284426e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.500691e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.339093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.562110e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562110e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.528369 sec - 1,515,520,073 cycles # 2.846 GHz - 3,173,010,189 instructions # 2.09 insn per cycle - 0.533003329 seconds time elapsed +TOTAL : 0.520524 sec + 1,514,451,185 cycles # 2.892 GHz + 3,172,765,595 instructions # 2.09 insn per cycle + 0.524939185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.626971e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.708754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.978270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.480865 sec - 1,378,241,594 cycles # 2.844 GHz - 3,034,725,088 instructions # 2.20 insn per cycle - 0.485339539 seconds time elapsed +TOTAL : 0.469539 sec + 1,371,933,121 cycles # 2.899 GHz + 3,033,200,949 instructions # 2.21 insn per cycle + 0.473789571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.445545e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.566504e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.566504e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.701984 sec - 1,365,857,372 cycles # 1.935 GHz - 1,995,672,274 instructions # 1.46 insn per cycle - 0.706431315 seconds time elapsed +TOTAL : 0.676505 sec + 1,357,238,089 cycles # 1.995 GHz + 1,995,412,477 instructions # 1.47 insn per cycle + 0.680880338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index a982c1092c..2ec6b9dc47 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:24 +DATE: 2023-11-09_17:47:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.808432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.231946e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.554075e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.819082e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.206686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.526015e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446409 sec - 1,914,723,819 cycles # 2.858 GHz - 2,720,530,830 instructions # 1.42 insn per cycle - 0.726781250 seconds time elapsed +TOTAL : 0.443165 sec + 1,961,379,003 cycles # 2.989 GHz + 2,781,357,072 instructions # 1.42 insn per cycle + 0.713330689 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.542836e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030986e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.390683e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580414e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.034117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.374431e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.529359 sec - 2,191,645,691 cycles # 2.864 GHz - 3,157,433,372 instructions # 1.44 insn per cycle - 0.823018193 seconds time elapsed +TOTAL : 0.527713 sec + 2,200,622,669 cycles # 2.860 GHz + 3,134,287,672 instructions # 1.42 insn per cycle + 0.826392715 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036305e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058434e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.055304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.603791 sec - 4,708,850,584 cycles # 2.929 GHz - 13,461,227,684 instructions # 2.86 insn per cycle - 1.607981971 seconds time elapsed +TOTAL : 1.607267 sec + 4,703,491,098 cycles # 2.920 GHz + 13,461,246,606 instructions # 2.86 insn per cycle + 1.611368977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.854678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.906299 sec - 2,638,123,420 cycles # 2.899 GHz - 7,554,662,347 instructions # 2.86 insn per cycle - 0.910729092 seconds time elapsed +TOTAL : 0.845910 sec + 2,624,687,455 cycles # 3.090 GHz + 7,554,687,341 instructions # 2.88 insn per cycle + 0.850163593 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.120658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.331862e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.331862e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.383208e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.548282 sec - 1,490,121,110 cycles # 2.699 GHz - 3,120,571,278 instructions # 2.09 insn per cycle - 0.552853693 seconds time elapsed +TOTAL : 0.505300 sec + 1,477,429,478 cycles # 2.904 GHz + 3,120,730,266 instructions # 2.11 insn per cycle + 0.509369657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.460892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.716719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.716719e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.736623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.003084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003084e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496477 sec - 1,349,987,385 cycles # 2.699 GHz - 2,981,775,320 instructions # 2.21 insn per cycle - 0.500801099 seconds time elapsed +TOTAL : 0.460033 sec + 1,340,907,328 cycles # 2.892 GHz + 2,981,159,149 instructions # 2.22 insn per cycle + 0.464174349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.283025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.395178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.395178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537070e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.658764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.658764e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.742923 sec - 1,336,539,142 cycles # 1.791 GHz - 1,954,402,399 instructions # 1.46 insn per cycle - 0.747445158 seconds time elapsed +TOTAL : 0.669277 sec + 1,326,031,179 cycles # 1.971 GHz + 1,954,098,862 instructions # 1.47 insn per cycle + 0.673467594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 0870ac1612..25d66c7041 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:42 +DATE: 2023-11-09_17:47:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.731772e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218499e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.346344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.746320e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236957e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.360917e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.443647 sec - 1,860,995,101 cycles # 2.829 GHz - 2,577,640,181 instructions # 1.39 insn per cycle - 0.715495532 seconds time elapsed +TOTAL : 0.440483 sec + 1,942,018,247 cycles # 2.976 GHz + 2,734,614,888 instructions # 1.41 insn per cycle + 0.710582968 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.975472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.830515e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.954464e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.010716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.836484e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960610e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.479313 sec - 1,996,286,635 cycles # 2.831 GHz - 2,879,932,210 instructions # 1.44 insn per cycle - 0.762534142 seconds time elapsed +TOTAL : 0.476376 sec + 2,093,828,578 cycles # 2.973 GHz + 2,983,215,115 instructions # 1.42 insn per cycle + 0.763925577 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.068560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092999e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092999e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175828e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175828e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.555116 sec - 4,461,765,661 cycles # 2.863 GHz - 13,052,553,175 instructions # 2.93 insn per cycle - 1.559192669 seconds time elapsed +TOTAL : 1.444504 sec + 4,454,034,181 cycles # 3.077 GHz + 13,052,158,813 instructions # 2.93 insn per cycle + 1.448436066 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.882925e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.070631e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.075306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.270472e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.270472e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.589518 sec - 1,706,750,598 cycles # 2.878 GHz - 4,515,023,670 instructions # 2.65 insn per cycle - 0.593859816 seconds time elapsed +TOTAL : 0.552218 sec + 1,700,873,014 cycles # 3.061 GHz + 4,515,081,496 instructions # 2.65 insn per cycle + 0.556201186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.765834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.493743e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.493743e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.031649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.790374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.790374e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305319 sec - 853,645,854 cycles # 2.763 GHz - 1,898,477,314 instructions # 2.22 insn per cycle - 0.309705869 seconds time elapsed +TOTAL : 0.291602 sec + 850,563,357 cycles # 2.883 GHz + 1,898,510,633 instructions # 2.23 insn per cycle + 0.295657443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.141881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.979826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.979826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.014318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.832565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.832565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287752 sec - 800,772,449 cycles # 2.748 GHz - 1,821,769,219 instructions # 2.28 insn per cycle - 0.292040341 seconds time elapsed +TOTAL : 0.293482 sec + 802,625,962 cycles # 2.700 GHz + 1,821,591,063 instructions # 2.27 insn per cycle + 0.297764671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 29,120,008 cycles # 2.647 GHz - 41,681,258 instructions # 1.43 insn per cycle - 0.011379573 seconds time elapsed + 29,732,895 cycles # 2.697 GHz + 41,670,508 instructions # 1.40 insn per cycle + 0.011409242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 0597ee22a3..687daa906c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:53:38 +DATE: 2023-11-09_18:11:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.639706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.257120e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.257120e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.747186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.237510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237510e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.449974 sec - 1,947,595,961 cycles # 2.942 GHz - 2,880,549,080 instructions # 1.48 insn per cycle - 0.719459148 seconds time elapsed +TOTAL : 0.451024 sec + 1,971,298,564 cycles # 2.989 GHz + 2,921,186,990 instructions # 1.48 insn per cycle + 0.718171404 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.168463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.812098e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.812098e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.154719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.829239e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.829239e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.616345 sec - 2,486,978,198 cycles # 2.935 GHz - 3,790,315,811 instructions # 1.52 insn per cycle - 0.904030203 seconds time elapsed +TOTAL : 0.620701 sec + 2,514,914,307 cycles # 2.959 GHz + 3,812,117,615 instructions # 1.52 insn per cycle + 0.908673198 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.130080e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155980e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.130761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156131e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.473693 sec - 4,471,154,333 cycles # 3.027 GHz - 13,056,458,670 instructions # 2.92 insn per cycle - 1.477812555 seconds time elapsed +TOTAL : 1.472797 sec + 4,472,979,155 cycles # 3.030 GHz + 13,056,761,338 instructions # 2.92 insn per cycle + 1.477050712 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.025004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.219703e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.219703e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.077738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.566150 sec - 1,723,667,712 cycles # 3.025 GHz - 4,563,297,886 instructions # 2.65 insn per cycle - 0.570436411 seconds time elapsed +TOTAL : 0.555619 sec + 1,722,866,665 cycles # 3.081 GHz + 4,563,322,469 instructions # 2.65 insn per cycle + 0.559797755 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.858362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587732e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587732e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.956375e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.303895 sec - 871,800,602 cycles # 2.835 GHz - 1,935,423,519 instructions # 2.22 insn per cycle - 0.308064640 seconds time elapsed +TOTAL : 0.298686 sec + 869,037,023 cycles # 2.875 GHz + 1,935,544,426 instructions # 2.23 insn per cycle + 0.302811266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.340209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.201036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.201036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.465666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.344453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.344453e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282456 sec - 818,779,897 cycles # 2.862 GHz - 1,858,681,592 instructions # 2.27 insn per cycle - 0.286757422 seconds time elapsed +TOTAL : 0.276910 sec + 817,448,595 cycles # 2.915 GHz + 1,858,610,780 instructions # 2.27 insn per cycle + 0.280974833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -211,9 +211,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 37,403,629 cycles # 2.691 GHz - 50,469,890 instructions # 1.35 insn per cycle - 0.014372629 seconds time elapsed + 37,531,426 cycles # 2.805 GHz + 50,366,354 instructions # 1.34 insn per cycle + 0.013813903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 1f88f16cf0..8bc404b84b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:59 +DATE: 2023-11-09_17:48:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.710525e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199979e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.326321e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.693711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339602e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.440656 sec - 1,914,804,492 cycles # 2.925 GHz - 2,653,138,253 instructions # 1.39 insn per cycle - 0.711749358 seconds time elapsed +TOTAL : 0.438778 sec + 1,941,964,603 cycles # 2.979 GHz + 2,729,283,404 instructions # 1.41 insn per cycle + 0.709465120 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.891594e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784645e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799531e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917090e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476610 sec - 2,083,856,457 cycles # 2.940 GHz - 2,965,032,628 instructions # 1.42 insn per cycle - 0.765879589 seconds time elapsed +TOTAL : 0.470244 sec + 2,084,172,928 cycles # 3.008 GHz + 2,971,888,877 instructions # 1.43 insn per cycle + 0.750810002 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129719e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.156981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183336e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.470593 sec - 4,452,128,732 cycles # 3.020 GHz - 13,033,118,765 instructions # 2.93 insn per cycle - 1.474660881 seconds time elapsed +TOTAL : 1.435690 sec + 4,451,626,158 cycles # 3.094 GHz + 13,032,987,489 instructions # 2.93 insn per cycle + 1.439578191 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.040157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.234537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.234537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.129722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328624e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.558718 sec - 1,691,566,910 cycles # 3.008 GHz - 4,511,110,866 instructions # 2.67 insn per cycle - 0.562886591 seconds time elapsed +TOTAL : 0.542754 sec + 1,689,058,698 cycles # 3.092 GHz + 4,510,968,389 instructions # 2.67 insn per cycle + 0.546880720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.942184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.690459e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690459e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.059640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.837369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.837369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.296099 sec - 853,486,904 cycles # 2.847 GHz - 1,895,390,282 instructions # 2.22 insn per cycle - 0.300311325 seconds time elapsed +TOTAL : 0.290425 sec + 852,449,044 cycles # 2.901 GHz + 1,895,470,717 instructions # 2.22 insn per cycle + 0.294595816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.374489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.242458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.242458e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.503379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376998e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.277008 sec - 800,885,707 cycles # 2.855 GHz - 1,817,516,411 instructions # 2.27 insn per cycle - 0.281135474 seconds time elapsed +TOTAL : 0.271227 sec + 799,263,402 cycles # 2.909 GHz + 1,817,410,136 instructions # 2.27 insn per cycle + 0.275264605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 28,754,068 cycles # 2.640 GHz - 40,955,371 instructions # 1.42 insn per cycle - 0.011419598 seconds time elapsed + 28,811,890 cycles # 2.702 GHz + 40,903,960 instructions # 1.42 insn per cycle + 0.011044926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1932) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d5ef07e007..eab7ec279c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:27:16 +DATE: 2023-11-09_17:48:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.821562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300473e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.628825e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.897435e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.394394e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.726655e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.447954 sec - 1,932,564,435 cycles # 2.921 GHz - 2,743,560,511 instructions # 1.42 insn per cycle - 0.719731147 seconds time elapsed +TOTAL : 0.442647 sec + 2,004,900,013 cycles # 3.007 GHz + 2,826,895,466 instructions # 1.41 insn per cycle + 0.724412660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.575286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.143575e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.499311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.620708e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161875e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511766e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523239 sec - 2,243,203,062 cycles # 2.949 GHz - 3,244,551,518 instructions # 1.45 insn per cycle - 0.818196957 seconds time elapsed +TOTAL : 0.520790 sec + 2,257,400,704 cycles # 2.997 GHz + 3,259,917,218 instructions # 1.44 insn per cycle + 0.810908697 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091100e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110933e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554535 sec - 4,725,018,841 cycles # 3.035 GHz - 13,469,753,614 instructions # 2.85 insn per cycle - 1.558693291 seconds time elapsed +TOTAL : 1.526966 sec + 4,723,154,452 cycles # 3.087 GHz + 13,469,602,667 instructions # 2.85 insn per cycle + 1.531097432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.970313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.046371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.046371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.063440e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.063440e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.853333 sec - 2,596,868,107 cycles # 3.030 GHz - 7,388,624,187 instructions # 2.85 insn per cycle - 0.857591565 seconds time elapsed +TOTAL : 0.845345 sec + 2,599,329,855 cycles # 3.062 GHz + 7,388,612,618 instructions # 2.84 insn per cycle + 0.849529924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.332912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554037e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554037e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.404332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.629825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.629825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.513899 sec - 1,466,763,063 cycles # 2.835 GHz - 3,057,876,447 instructions # 2.08 insn per cycle - 0.518107133 seconds time elapsed +TOTAL : 0.502979 sec + 1,466,711,057 cycles # 2.896 GHz + 3,057,623,965 instructions # 2.08 insn per cycle + 0.507143043 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.777029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085245e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455720 sec - 1,306,910,741 cycles # 2.845 GHz - 2,932,818,419 instructions # 2.24 insn per cycle - 0.460076062 seconds time elapsed +TOTAL : 0.452713 sec + 1,309,685,857 cycles # 2.871 GHz + 2,932,566,248 instructions # 2.24 insn per cycle + 0.456835979 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.391166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.500870e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.500870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.510097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510097e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.709219 sec - 1,365,455,058 cycles # 1.916 GHz - 1,971,797,344 instructions # 1.44 insn per cycle - 0.713482957 seconds time elapsed +TOTAL : 0.707515 sec + 1,366,670,273 cycles # 1.922 GHz + 1,971,774,412 instructions # 1.44 insn per cycle + 0.711692701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 6e69f82aee..804124a528 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:27:34 +DATE: 2023-11-09_17:48:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.812345e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.208019e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.520614e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.811798e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.176696e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.495530e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446229 sec - 1,955,610,259 cycles # 2.936 GHz - 2,744,647,203 instructions # 1.40 insn per cycle - 0.725146174 seconds time elapsed +TOTAL : 0.443833 sec + 2,007,951,396 cycles # 2.999 GHz + 2,822,905,809 instructions # 1.41 insn per cycle + 0.728453943 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.529337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.985996e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.326246e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.587196e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.041060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377539e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.522218 sec - 2,238,618,942 cycles # 2.943 GHz - 3,202,939,408 instructions # 1.43 insn per cycle - 0.817604471 seconds time elapsed +TOTAL : 0.523091 sec + 2,298,379,472 cycles # 2.986 GHz + 3,299,691,245 instructions # 1.44 insn per cycle + 0.827230276 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.065182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.081127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.560291 sec - 4,729,799,308 cycles # 3.025 GHz - 13,455,876,389 instructions # 2.84 insn per cycle - 1.564515481 seconds time elapsed +TOTAL : 1.537190 sec + 4,726,723,623 cycles # 3.068 GHz + 13,455,766,194 instructions # 2.85 insn per cycle + 1.541247326 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.946971e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.020229e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.020229e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.984806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061569e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.863034 sec - 2,601,868,480 cycles # 3.003 GHz - 7,392,543,085 instructions # 2.84 insn per cycle - 0.867199240 seconds time elapsed +TOTAL : 0.846326 sec + 2,602,293,302 cycles # 3.065 GHz + 7,392,635,608 instructions # 2.84 insn per cycle + 0.850454133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.323539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.538773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599128e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599128e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.514660 sec - 1,469,850,553 cycles # 2.835 GHz - 3,058,079,146 instructions # 2.08 insn per cycle - 0.519050232 seconds time elapsed +TOTAL : 0.506085 sec + 1,466,467,612 cycles # 2.876 GHz + 3,058,106,145 instructions # 2.09 insn per cycle + 0.510457197 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.767525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.778195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.059768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.059768e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456721 sec - 1,309,025,943 cycles # 2.843 GHz - 2,933,534,120 instructions # 2.24 insn per cycle - 0.460967936 seconds time elapsed +TOTAL : 0.455384 sec + 1,311,774,111 cycles # 2.858 GHz + 2,933,399,487 instructions # 2.24 insn per cycle + 0.459674797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.405794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.516831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.516831e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497799e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.704869 sec - 1,364,487,579 cycles # 1.926 GHz - 1,971,713,310 instructions # 1.45 insn per cycle - 0.709028391 seconds time elapsed +TOTAL : 0.711136 sec + 1,370,131,308 cycles # 1.917 GHz + 1,971,581,787 instructions # 1.44 insn per cycle + 0.715633425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From b89bf4cd355304d673506ce11aff4dbc3c4e04c4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 06:38:26 +0100 Subject: [PATCH 14/14] [gpucpp] ** COMPLETE GPUCPP** rerun 18 tmad tests after the upgrade to 3.5.2, no change in functionality or performance STARTED AT Thu Nov 9 06:24:51 PM CET 2023 ENDED AT Thu Nov 9 10:43:11 PM CET 2023 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 138 ++++++++--------- .../log_eemumu_mad_f_inl0_hrd0.txt | 134 ++++++++--------- .../log_eemumu_mad_m_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggtt_mad_d_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggtt_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggtt_mad_m_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 134 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 140 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 138 ++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 134 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 136 ++++++++--------- 18 files changed, 
1222 insertions(+), 1222 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 383178f656..16028d3846 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:17 +DATE: 2023-11-09_18:26:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6257s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6178s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6383s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6302s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1680s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1797s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4280s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3388s + [COUNTERS] Fortran MEs ( 1 ) : 0.0892s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4131s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0701s for 90112 events => throughput is 1.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4144s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.3434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0710s for 90112 events => throughput is 1.27E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.227734e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246747e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254814e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1774s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3861s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 90112 events => throughput is 2.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3872s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0440s for 90112 events => throughput is 2.05E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002470e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008841e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.006601e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041604e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3431s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.620678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.648221e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.819190e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.737599e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3723s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3407s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0316s for 90112 events => throughput is 2.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3409s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 90112 events => throughput is 2.84E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.820321e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.822405e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.842053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.840653e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1819s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1785s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.34E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 90112 events => throughput is 2.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.075096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213684e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.166357e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.288308e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5934s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5941s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.56E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7863s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7814s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7594s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.141020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122558e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.873271e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902108e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.990853e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.029032e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.361218e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.427964e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.939860e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.944408e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966232e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.975323e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.011562e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.124184e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099952e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 4b3b0b9b07..bed8731e5c 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:35 +DATE: 2023-11-09_18:26:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6295s + [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.03E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1778s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1697s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1700s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4139s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s - [COUNTERS] Fortran MEs ( 1 ) : 0.0849s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0858s for 90112 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] PROGRAM TOTAL : 0.1876s [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s 
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4142s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0687s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3439s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 90112 events => throughput is 1.30E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.261327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.290954e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.287607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269110e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1799s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.11E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3425s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 90112 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.211958e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.298087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331194e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3609s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 90112 events => throughput is 3.47E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.442542e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583243e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.634986e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664821e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1881s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.64E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 90112 events => throughput is 3.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4013s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 90112 events => throughput is 3.35E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.588607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.872180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716354e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1865s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.69E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3791s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0259s for 90112 events => throughput is 3.47E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372399e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.388042e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.586770e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799218e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.69E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5957s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5952s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7616s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.96E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.97E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.577355e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.613080e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.822297e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.898284e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.937359e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.543811e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.046785e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.026187e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.102347e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.468953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203659e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241582e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.365649e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812787e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.422918e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.411277e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 9a947a36a5..8b8c11aaf5 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,13 +17,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:51 +DATE: 2023-11-09_18:26:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6186s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6293s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6211s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1781s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4162s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] Fortran MEs ( 1 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4185s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3320s + [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 
events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,8 +134,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1898s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] PROGRAM TOTAL : 0.1883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1817s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4177s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3452s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 90112 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192297e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.208788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.206668e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1831s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.12E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3821s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3847s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3426s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.047978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.077610e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.116427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127798e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3733s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3413s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.642107e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567900e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.787956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786544e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.82E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,8 +395,8 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3708s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3395s + [COUNTERS] PROGRAM TOTAL : 0.3728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3415s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 90112 events => throughput is 2.88E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.821887e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787216e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.874115e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.802177e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 90112 events => throughput is 2.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3437s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0371s for 90112 events => throughput is 2.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.237740e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.306669e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366800e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302969e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5935s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5947s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.82E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7615s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.88E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.007927e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094813e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.918411e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912678e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.018629e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000800e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.348012e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.334730e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.994146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018486e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.917104e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914438e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.983673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.024074e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.123333e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129214e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 3e628018af..824a8e25d5 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:09:08 +DATE: 2023-11-09_18:26:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3517s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3140s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3086s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2681s - [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2683s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6533s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2091s - [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 1.6956s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2429s + [COUNTERS] Fortran MEs ( 1 ) : 0.4528s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3456s + [COUNTERS] PROGRAM TOTAL : 0.3445s [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 8192 events => throughput is 2.16E+05 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 8192 events => throughput is 2.23E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6716s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2645s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4071s for 90112 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6787s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4128s for 90112 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.224417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.212367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211188e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3132s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2919s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4780s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2359s for 90112 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2354s for 90112 events => throughput is 3.83E+05 events/s *** (2-sse4) 
Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.777989e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.806213e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.740213e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.795645e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2955s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2981s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.23E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3915s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3832s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2385s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1448s for 90112 events => throughput is 6.22E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.030466e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.053490e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.192047e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.106690e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2946s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2826s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2943s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.97E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3659s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2362s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1297s for 90112 events => throughput is 6.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3653s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1287s for 90112 events => throughput is 7.00E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.841360e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.704382e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.816529e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.799597e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3142s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3082s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2885s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.15E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4624s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2517s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2106s for 90112 events => throughput is 4.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4291s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2333s for 90112 events => throughput is 3.86E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.955720e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.938387e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.094472e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.929754e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6940s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6935s
+ [COUNTERS] PROGRAM TOTAL : 0.6969s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6963s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.7032s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6968s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6570s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.103744e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.071187e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.691695e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.692368e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.183000e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.070229e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.074203e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.168601e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.195387e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.149757e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.150737e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.190999e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.203236e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017633e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.040065e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 0321a276a0..6ff403b879 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -4,9 +4,9 @@ CUDACPP_BUILDDIR='.'
 make USEBUILDDIR=1 AVX=none
-
-make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=sse4
+
 make USEBUILDDIR=1 AVX=512y
 make USEBUILDDIR=1 AVX=512z
@@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:09:34
+DATE: 2023-11-09_18:27:25
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 420 events (found 1577 events)
- [COUNTERS] PROGRAM TOTAL : 0.3489s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3494s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0401s for 8192 events => throughput is 2.04E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3073s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2670s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3068s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6502s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2060s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6536s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2070s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.4466s for 90112 events => throughput is 2.02E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3425s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3397s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3049s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0348s for 8192 events => throughput is 2.35E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6631s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2770s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3861s for 90112 events => throughput is 2.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6398s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2589s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3809s for 90112 events => throughput is 2.37E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.342613e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.342865e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319125e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.331036e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3002s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2858s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.66E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2992s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2845s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.59E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3973s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2359s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1614s for 90112 events => throughput is 5.58E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4772s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3090s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1682s for 90112 events => throughput is 5.36E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.270911e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.225442e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.359921e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.299428e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2834s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2758s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3093s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3001s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0092s for 8192 events => throughput is 8.88E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3197s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 90112 events => throughput is 1.07E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3166s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2317s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0848s for 90112 events => throughput is 1.06E+06 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.026437e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.025673e+06 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.028771e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.017812e+06 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,8 +362,8 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2894s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s
+ [COUNTERS] PROGRAM TOTAL : 0.2858s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2785s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3159s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2370s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 90112 events => throughput is 1.14E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3072s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2282s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0790s for 90112 events => throughput is 1.14E+06 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.095999e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.097760e+06 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.120004e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.119253e+06 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2909s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2907s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2807s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0099s for 8192 events => throughput is 8.25E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4173s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2979s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1195s for 90112 events => throughput is 7.54E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3509s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2397s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 90112 events => throughput is 8.10E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.668644e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.884299e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.548978e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.701504e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6943s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6937s
+ [COUNTERS] PROGRAM TOTAL : 0.6960s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6955s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6513s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6459s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.67E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6571s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.266713e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.111635e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.234896e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.880409e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.830084e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.143607e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.762403e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.762374e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.776301e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.140173e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.872477e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.866583e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.374142e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.685718e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.426544e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.400545e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 8bacc65fe8..9b02995ca5 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -1,8 +1,8 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 CUDACPP_BUILDDIR='.'
-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:09:59
+DATE: 2023-11-09_18:27:49
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 420 events (found 1577 events)
- [COUNTERS] PROGRAM TOTAL : 0.3627s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3194s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3509s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3105s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3074s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3067s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2662s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6907s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4542s for 90112 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6580s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2117s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.4462s for 90112 events => throughput is 2.02E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3443s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3074s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3460s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 8192 events => throughput is 2.16E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6798s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2683s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4115s for 90112 events => throughput is 2.19E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6700s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4119s for 90112 events => throughput is 2.19E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.164831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.182152e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.183670e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.183502e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3143s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0208s for 8192 events => throughput is 3.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3147s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0207s for 8192 events => throughput is 3.96E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4761s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2466s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2295s for 90112 events => throughput is 3.93E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4759s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2478s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2281s for 90112 events => throughput is 3.95E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.799865e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.820026e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.756525e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.775419e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2976s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2965s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2837s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4201s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2736s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1464s for 90112 events => throughput is 6.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3920s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2488s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1432s for 90112 events => throughput is 6.29E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.181937e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.159361e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.243573e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.220899e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2977s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2865s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3061s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2938s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3670s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2408s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1262s for 90112 events => throughput is 7.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3693s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2423s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1270s for 90112 events => throughput is 7.10E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.933959e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.912537e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.064349e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.069074e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3083s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 8192 events => throughput is 4.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3327s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3109s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.75E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4519s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2484s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2036s for 90112 events => throughput is 4.43E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4629s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2551s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2078s for 90112 events => throughput is 4.34E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.266660e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.077933e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.117226e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.997576e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6949s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6943s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6985s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6979s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6539s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6476s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.42E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6617s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6553s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.049281e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.060435e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.529307e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.608769e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.148817e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.186491e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.053163e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.059369e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.170472e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.182441e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.130394e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.136921e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.186789e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.174632e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.035076e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.949461e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 09e16e6057..241597d591 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+
 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:10:24
+DATE: 2023-11-09_18:28:15
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5436s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2280s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3156s for 8192 events => throughput is 2.60E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5556s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5326s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2186s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3141s for 8192 events => throughput is 2.61E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5351s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3148s for 8192 events => throughput is 2.60E+04 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9133s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4066s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5067s for 90112 events => throughput is 2.57E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8579s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3886s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4692s for 90112 events => throughput is 2.60E+04 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8544s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5319s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3225s for 8192 events => throughput is 2.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8596s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3241s for 8192 events => throughput is 2.53E+04 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3255s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6247s for 90112 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2563s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6842s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.5721s for 90112 events => throughput is 2.52E+04 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.590377e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.570949e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.610150e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.596498e+04 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5624s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1763s for 8192 events => throughput is 4.65E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5542s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 8192 events => throughput is 4.87E+04 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.3972s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5470s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8502s for 90112 events => throughput is 4.87E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.5019s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5803s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.9216s for 90112 events => throughput is 4.69E+04 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.010592e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.985717e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.958333e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.959096e+04 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3818s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3840s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3011s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.3684s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4497s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9187s for 90112 events => throughput is 9.81E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3753s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4512s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9241s for 90112 events => throughput is 9.75E+04 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.953639e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.005162e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.002866e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000723e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3650s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0744s for 8192 events => throughput is 1.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3672s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.2634s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4412s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8222s for 90112 events => throughput is 1.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.2690s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4430s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8260s for 90112 events => throughput is 1.09E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.117525e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.111268e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.126876e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.117996e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4269s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3231s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1039s for 8192 events => throughput is 7.89E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4279s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.6060s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4689s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1371s for 90112 events => throughput is 7.92E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.6406s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4855s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1551s for 90112 events => throughput is 7.80E+04 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.896705e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.832306e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.740238e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.896180e+04 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6527s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s
+ [COUNTERS] PROGRAM TOTAL : 0.6558s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6503s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8560s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8329s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8300s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624902e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.613028e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.902263e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.229609e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.850642e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.871226e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.238047e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.236452e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.868590e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.869896e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.248755e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.247810e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.862444e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.851703e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.745100e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.745705e+06 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 1a98ebc0f5..9b1af7b411 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'
+make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:11:05 +DATE: 2023-11-09_18:28:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5362s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] Fortran MEs ( 1 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2218s + [COUNTERS] Fortran MEs ( 1 ) : 0.3159s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5340s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s - [COUNTERS] Fortran MEs ( 1 ) : 0.3154s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5364s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s + [COUNTERS] Fortran MEs ( 1 ) : 0.3161s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.8590s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3887s - 
[COUNTERS] Fortran MEs ( 1 ) : 3.4703s for 90112 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9162s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3985s + [COUNTERS] Fortran MEs ( 1 ) : 3.5176s for 90112 events => throughput is 2.56E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8380s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3125s for 8192 events => throughput is 2.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1166s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6696s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4470s for 90112 events => throughput is 2.61E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1769s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4887s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.677117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.661457e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.693750e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666467e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4030s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0934s for 8192 events => throughput is 8.77E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4080s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0948s for 8192 events => throughput is 8.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4739s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0282s for 90112 events => throughput is 8.76E+04 events/s + [COUNTERS] 
PROGRAM TOTAL : 2.5043s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4696s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0347s for 90112 events => throughput is 8.71E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.839523e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.800531e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.853955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.815957e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,8 +286,8 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3025s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2591s + [COUNTERS] PROGRAM TOTAL : 0.3077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8724s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4009s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4715s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4793s for 90112 events => throughput is 1.88E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.919418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823286e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826868e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.2944s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2562s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2983s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8215s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3936s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4279s for 90112 events => throughput is 2.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8383s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4080s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4303s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.114883e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101947e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.107711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126133e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0508s for 8192 events => throughput is 1.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2717s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0503s for 8192 events => throughput is 1.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9668s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4146s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5522s for 90112 events => throughput is 1.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9888s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5637s for 90112 events => throughput is 1.60E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.619298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589248e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587181e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.56E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6498s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6490s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.7852s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7757s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8143s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8048s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.51E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.275339e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303788e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.852966e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.857184e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.672301e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.727610e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.329588e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.358085e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.661199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.712514e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.474053e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.511679e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.616407e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621450e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b41396f75b..e102a98f20 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:11:42 +DATE: 2023-11-09_18:29:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2201s - [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2214s + [COUNTERS] Fortran MEs ( 1 ) : 0.3192s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s - [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5369s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2199s + [COUNTERS] Fortran MEs ( 1 ) : 0.3170s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.8603s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3884s - 
[COUNTERS] Fortran MEs ( 1 ) : 3.4719s for 90112 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3845s + [COUNTERS] Fortran MEs ( 1 ) : 3.4687s for 90112 events => throughput is 2.60E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3301s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3331s for 8192 events => throughput is 2.46E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2894s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6845s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6049s for 90112 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3597s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7144s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6453s for 90112 events => throughput is 2.47E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.562016e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.553245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.546299e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536593e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1640s for 8192 events => throughput is 4.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5484s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 8192 events => throughput is 4.94E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.3591s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5386s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8206s for 90112 events => throughput is 4.95E+04 events/s + [COUNTERS] 
PROGRAM TOTAL : 3.3712s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5426s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8286s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.765208e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047917e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.784106e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047714e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3884s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0838s for 8192 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3898s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9297s for 90112 events => throughput is 9.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3827s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9273s for 90112 events => throughput is 9.72E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.985245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.974556e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2429s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4365s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8064s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2643s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8151s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067840e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.146843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069793e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3234s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1077s for 8192 events => throughput is 7.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1191s for 8192 events => throughput is 6.88E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6602s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4811s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1791s for 90112 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7582s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5206s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2377s for 90112 events => throughput is 7.28E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.628154e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.675272e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.726777e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.626790e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7961s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8293s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.619555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635720e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.404025e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.120274e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.847979e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.835173e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231986e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.825056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.818919e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.833245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.805414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724277e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724480e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index e6041006eb..408d8d380a 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -16,14 +16,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:12:23 +DATE: 2023-11-09_18:30:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.3823s + [COUNTERS] PROGRAM TOTAL : 4.3928s [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s - [COUNTERS] Fortran MEs ( 1 ) : 4.1043s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] Fortran MEs ( 1 ) : 4.1147s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.3581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s - [COUNTERS] Fortran MEs ( 1 ) : 4.0853s for 8192 events => throughput is 2.01E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2715s + [COUNTERS] Fortran MEs ( 1 ) : 4.1131s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.0624s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8747s - [COUNTERS] Fortran MEs ( 1 ) : 45.1877s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.4210s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8888s + [COUNTERS] Fortran MEs ( 1 ) : 45.5321s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.6032s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3774s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2258s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2521s for 8192 events => 
throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9752s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.5903s for 90112 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.9600s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0482s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.9118s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992604e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000666e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989276e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.6983s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4517s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2466s for 8192 events => throughput is 3.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7458s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4719s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2739s for 8192 events => throughput is 3.60E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.0395s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1529s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8866s for 90112 events => throughput is 3.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.7086s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1354s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5732s for 90112 events => throughput is 3.52E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775162e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697279e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.752647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704506e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2090s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2291s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9799s for 8192 events => throughput is 8.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9832s for 8192 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.6019s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8333s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7686s for 90112 events => throughput is 8.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6675s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8433s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8242s for 90112 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.622945e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.632389e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.637406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.597678e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8591s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.2465s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7171s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5294s for 90112 events => throughput is 9.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1508s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4405s for 90112 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.867536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.863291e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.834174e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.840135e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4062s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3349s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0712s for 8192 events => throughput is 7.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4500s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1197s for 8192 events => throughput is 7.32E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.7127s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9424s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7703s for 90112 events => throughput is 7.66E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.0835s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0464s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0370s for 90112 events => throughput is 7.49E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.671946e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.677485e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.485706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.683279e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8073s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7752s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7243s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3497s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7514s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3551s for 90112 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.290435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.285714e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505353e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109677e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.162766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.147684e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.113597e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164951e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.114486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.433160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.432331e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index a18920ba3f..f4a809f68b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:16:35 +DATE: 2023-11-09_18:34:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2726s - [COUNTERS] Fortran MEs ( 1 ) : 4.1766s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s + [COUNTERS] Fortran MEs ( 1 ) : 4.1176s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.3607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s - [COUNTERS] Fortran MEs ( 1 ) : 4.0903s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5146s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2747s + [COUNTERS] Fortran MEs ( 1 ) : 4.2399s for 8192 events => throughput is 1.93E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.0727s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8744s - [COUNTERS] Fortran MEs ( 1 ) : 45.1984s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8962s + [COUNTERS] Fortran MEs ( 1 ) : 45.4494s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.3702s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2240s - [COUNTERS] CudaCpp MEs ( 2 
) : 4.1462s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.3558s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2546s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1013s for 8192 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 50.9666s - [COUNTERS] Fortran Overhead ( 0 ) : 5.8905s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.0761s for 90112 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.2827s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9515s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.3313s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.075529e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068073e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.074082e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068719e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5244s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1534s for 8192 events => throughput is 7.10E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3795s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1203s for 8192 events => throughput is 7.31E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.2999s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0272s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.2727s for 90112 events => throughput is 7.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4928s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0115s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4813s for 90112 events => throughput is 7.22E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.470531e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.461964e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461238e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4990s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4968s for 8192 events => throughput is 1.65E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.8843s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3862s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4981s for 90112 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8987s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3780s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.5207s for 90112 events => throughput is 1.63E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.703770e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.671559e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.715659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684139e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4306s for 8192 events => throughput is 1.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4354s for 8192 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.0325s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7425s for 90112 events => throughput is 1.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3176s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.8078s for 90112 events => throughput is 1.87E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.946675e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932083e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957212e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934934e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7944s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5278s for 8192 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8031s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5312s for 8192 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.1973s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3930s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8043s for 90112 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3073s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4189s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8884s for 90112 events => throughput is 1.53E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.558982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547676e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568288e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546957e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7491s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7736s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7522s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.81E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.5805s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5981s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2353s for 90112 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.598757e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602414e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.937809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925045e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.495923e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484752e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.725491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.656642e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.498449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490786e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.660457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725267e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.473649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471712e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.522099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.530964e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 05db57554d..9bed8b02d9 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:19:53 +DATE: 2023-11-09_18:37:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.3676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s - [COUNTERS] Fortran MEs ( 1 ) : 4.0901s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2752s + [COUNTERS] Fortran MEs ( 1 ) : 4.0929s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s - [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3422s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s + [COUNTERS] Fortran MEs ( 1 ) : 4.0719s for 8192 events => throughput is 2.01E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.1152s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8703s - [COUNTERS] 
Fortran MEs ( 1 ) : 45.2450s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8864s + [COUNTERS] Fortran MEs ( 1 ) : 45.2857s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.7049s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4327s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2722s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4356s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2558s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.0960s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0891s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.0069s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 54.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0604s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.9495s for 90112 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.971437e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955214e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.965809e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962469e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7042s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4800s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2242s for 8192 events => throughput is 3.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7696s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4653s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3043s for 8192 events => throughput is 3.56E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 28.5105s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0554s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4551s for 90112 events => throughput is 3.68E+03 events/s + 
[COUNTERS] PROGRAM TOTAL : 28.7487s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0795s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.6692s for 90112 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.800834e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767280e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.788503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.771152e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.1858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2226s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9633s for 8192 events => throughput is 8.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9708s for 8192 events => throughput is 8.44E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.5514s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8252s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7262s for 90112 events => throughput is 8.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6387s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8343s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8044s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.756273e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.765902e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.759413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.708316e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9510s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8530s for 8192 events => throughput is 9.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9610s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1075s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8536s for 8192 events => throughput is 9.60E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.1748s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7107s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4641s for 90112 events => throughput is 9.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2117s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7290s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4827s for 90112 events => throughput is 9.50E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.859146e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.837213e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.890303e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.813722e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4412s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3447s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0965s for 8192 events => throughput is 7.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4206s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0809s for 8192 events => throughput is 7.58E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9437s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8266s for 90112 events => throughput is 7.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9517s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9023s for 90112 events => throughput is 7.57E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.668015e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.664729e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694387e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.661148e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7728s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7246s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3499s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7640s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3587s for 90112 events => throughput is 2.51E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.280245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297023e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.525176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536170e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.116522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107408e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.157499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.153471e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118088e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172287e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.122850e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120562e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.440669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436751e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index b972c40fa5..635bc8aab0 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:25:30 +DATE: 2023-11-09_18:43:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.8408s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s - [COUNTERS] Fortran MEs ( 1 ) : 95.3863s for 8192 events => throughput is 8.59E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4594s + [COUNTERS] Fortran MEs ( 1 ) : 95.7384s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.5040s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s - [COUNTERS] Fortran MEs ( 1 ) : 95.0528s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4572s + [COUNTERS] Fortran MEs ( 1 ) : 95.7366s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1050.5151s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1583s - [COUNTERS] Fortran MEs ( 1 ) : 1046.3568s for 90112 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1056.1191s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1851s + [COUNTERS] Fortran MEs ( 1 ) 
: 1051.9341s for 90112 events => throughput is 8.57E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 216.0448s - [COUNTERS] Fortran Overhead ( 0 ) : 99.5423s - [COUNTERS] CudaCpp MEs ( 2 ) : 116.5025s for 8192 events => throughput is 7.03E+01 events/s + [COUNTERS] PROGRAM TOTAL : 221.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 101.5022s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.7500s for 8192 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1395.0826s - [COUNTERS] Fortran Overhead ( 0 ) : 101.4573s - [COUNTERS] CudaCpp MEs ( 2 ) : 1293.6254s for 90112 events => throughput is 6.97E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1409.0435s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0565s + [COUNTERS] CudaCpp MEs ( 2 ) : 1309.9869s for 90112 events => throughput is 6.88E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.294341e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535302e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.275454e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232167e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 107.3938s - [COUNTERS] Fortran Overhead ( 0 ) : 49.4703s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.9235s for 8192 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.7463s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5074s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.2390s for 8192 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 689.6088s - [COUNTERS] Fortran Overhead ( 0 ) : 53.6676s - [COUNTERS] CudaCpp MEs ( 2 ) : 635.9412s for 90112 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 695.6110s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4125s + [COUNTERS] CudaCpp MEs ( 2 ) : 642.1984s for 90112 events => throughput is 1.40E+02 
events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.663387e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667754e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.672792e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.5726s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0971s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.4754s for 8192 events => throughput is 2.98E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7441s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3520s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3921s for 8192 events => throughput is 2.99E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 326.9697s - [COUNTERS] Fortran Overhead ( 0 ) : 26.6301s - [COUNTERS] CudaCpp MEs ( 2 ) : 300.3396s for 90112 events => throughput is 3.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 331.0298s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1582s + [COUNTERS] CudaCpp MEs ( 2 ) : 303.8716s for 90112 events => throughput is 2.97E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.612820e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602735e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.630261e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607119e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.4764s - [COUNTERS] Fortran Overhead ( 0 ) : 20.3120s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.1644s for 8192 events => throughput is 3.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.2409s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3557s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8852s for 8192 events => throughput is 3.43E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 289.1902s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9124s - [COUNTERS] CudaCpp MEs ( 2 ) : 265.2778s for 90112 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 289.3981s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9732s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.4249s for 90112 events => throughput is 3.40E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.088132e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111160e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127446e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141844e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 22.1825s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5139s for 8192 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.6199s + [COUNTERS] Fortran Overhead ( 0 ) : 22.4059s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2139s for 8192 events => throughput is 3.53E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 283.5251s - [COUNTERS] Fortran Overhead ( 0 ) : 25.9112s - [COUNTERS] CudaCpp MEs ( 2 ) : 257.6139s for 90112 events => throughput is 3.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 283.6130s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2046s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.4085s for 90112 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.741805e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.763228e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.777930e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.741992e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1069s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0806s for 8192 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1190s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0789s for 8192 events => throughput is 7.59E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.7118s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8168s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8950s for 90112 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7674s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8891s for 90112 events => throughput is 7.58E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.523661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.527117e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.283120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.256112e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.266218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.240392e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.591927e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.568765e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.251570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.279873e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.476794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.441727e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.262349e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.268118e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.252080e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240204e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 3ca211fa85..9a7b15ddba 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_23:51:54 +DATE: 2023-11-09_20:10:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.6648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s - [COUNTERS] Fortran MEs ( 1 ) : 95.2103s for 8192 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] Fortran MEs ( 1 ) : 95.1980s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.3879s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4521s - [COUNTERS] Fortran MEs ( 1 ) : 94.9358s for 8192 events => throughput is 8.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] Fortran MEs ( 1 ) : 95.1237s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1051.3512s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1998s - [COUNTERS] Fortran MEs ( 1 ) : 
1047.1514s for 90112 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1055.1274s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1731s + [COUNTERS] Fortran MEs ( 1 ) : 1050.9543s for 90112 events => throughput is 8.57E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 195.2840s - [COUNTERS] Fortran Overhead ( 0 ) : 89.6572s - [COUNTERS] CudaCpp MEs ( 2 ) : 105.6269s for 8192 events => throughput is 7.76E+01 events/s + [COUNTERS] PROGRAM TOTAL : 198.8691s + [COUNTERS] Fortran Overhead ( 0 ) : 90.2534s + [COUNTERS] CudaCpp MEs ( 2 ) : 108.6157s for 8192 events => throughput is 7.54E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1253.2021s - [COUNTERS] Fortran Overhead ( 0 ) : 93.4786s - [COUNTERS] CudaCpp MEs ( 2 ) : 1159.7235s for 90112 events => throughput is 7.77E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1275.3669s + [COUNTERS] Fortran Overhead ( 0 ) : 93.9491s + [COUNTERS] CudaCpp MEs ( 2 ) : 1181.4178s for 90112 events => throughput is 7.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.188520e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.083570e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.207566e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.167448e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.9590s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2330s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7260s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.8398s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4099s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.4299s for 8192 events => throughput is 3.10E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 312.4727s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8498s - [COUNTERS] CudaCpp MEs ( 2 ) : 285.6229s for 90112 events => throughput is 3.15E+02 
events/s + [COUNTERS] PROGRAM TOTAL : 320.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9904s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.3932s for 90112 events => throughput is 3.07E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.595667e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524011e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562557e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 25.4046s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8022s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.6023s for 8192 events => throughput is 6.02E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.3018s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8221s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.4798s for 8192 events => throughput is 6.08E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 164.6743s - [COUNTERS] Fortran Overhead ( 0 ) : 15.5764s - [COUNTERS] CudaCpp MEs ( 2 ) : 149.0979s for 90112 events => throughput is 6.04E+02 events/s + [COUNTERS] PROGRAM TOTAL : 161.8530s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4501s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.4028s for 90112 events => throughput is 6.16E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.233727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.213869e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.144603e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.163477e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.4388s - [COUNTERS] Fortran Overhead ( 0 ) : 10.5095s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9293s for 8192 events => throughput is 6.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.2497s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3581s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8916s for 8192 events => throughput is 6.89E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 145.9227s - [COUNTERS] Fortran Overhead ( 0 ) : 13.9719s - [COUNTERS] CudaCpp MEs ( 2 ) : 131.9508s for 90112 events => throughput is 6.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 144.5243s + [COUNTERS] Fortran Overhead ( 0 ) : 14.0601s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.4642s for 90112 events => throughput is 6.91E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.277686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.261245e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.316223e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.179572e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.8899s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3435s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.5464s for 8192 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.8272s + [COUNTERS] Fortran Overhead ( 0 ) : 11.2607s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5665s for 8192 events => throughput is 7.08E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 142.7065s - [COUNTERS] Fortran Overhead ( 0 ) : 14.9424s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.7641s for 90112 events => throughput is 7.05E+02 events/s + [COUNTERS] PROGRAM TOTAL : 143.3402s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9961s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.3441s for 90112 events => throughput is 7.02E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.537880e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.537594e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.497574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456699e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4801s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4895s for 8192 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.0377s - [COUNTERS] Fortran Overhead ( 0 ) : 5.5836s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4541s for 90112 events => throughput is 1.65E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.0626s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6077s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4549s for 90112 events => throughput is 1.65E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640892e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626171e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619412e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329585e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340657e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.369301e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.426283e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.304460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326049e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376586e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360046e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.333260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.341201e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.421151e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.441486e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 2729351c42..e947131942 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-09_00:57:00 +DATE: 2023-11-09_21:16:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.3917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4564s - [COUNTERS] Fortran MEs ( 1 ) : 94.9352s for 8192 events => throughput is 8.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6107s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4599s + [COUNTERS] Fortran MEs ( 1 ) : 95.1508s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.2404s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4495s - [COUNTERS] Fortran MEs ( 1 ) : 94.7909s for 8192 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4540s + [COUNTERS] Fortran MEs ( 1 ) : 95.1304s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1049.6483s - [COUNTERS] Fortran 
Overhead ( 0 ) : 4.1482s - [COUNTERS] Fortran MEs ( 1 ) : 1045.5001s for 90112 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1052.2893s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1570s + [COUNTERS] Fortran MEs ( 1 ) : 1048.1323s for 90112 events => throughput is 8.60E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 223.2377s - [COUNTERS] Fortran Overhead ( 0 ) : 102.8564s - [COUNTERS] CudaCpp MEs ( 2 ) : 120.3813s for 8192 events => throughput is 6.81E+01 events/s + [COUNTERS] PROGRAM TOTAL : 223.0748s + [COUNTERS] Fortran Overhead ( 0 ) : 103.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.6775s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1425.5713s - [COUNTERS] Fortran Overhead ( 0 ) : 106.5194s - [COUNTERS] CudaCpp MEs ( 2 ) : 1319.0519s for 90112 events => throughput is 6.83E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1425.4469s + [COUNTERS] Fortran Overhead ( 0 ) : 107.1167s + [COUNTERS] CudaCpp MEs ( 2 ) : 1318.3302s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.033155e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.990567e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.028364e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.033316e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 110.1179s - [COUNTERS] Fortran Overhead ( 0 ) : 50.7873s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.3305s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 112.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 51.1368s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.0216s for 8192 events => throughput is 1.34E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 704.2691s - [COUNTERS] Fortran Overhead ( 0 ) : 54.2949s - [COUNTERS] CudaCpp MEs ( 2 ) : 649.9742s for 90112 events => throughput is 1.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 719.1467s + [COUNTERS] Fortran Overhead ( 0 ) : 54.6964s + [COUNTERS] CudaCpp MEs ( 2 ) : 664.4503s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635297e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625730e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628042e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622146e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.2204s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9374s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2831s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2016s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.5252s for 8192 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 314.3646s - [COUNTERS] Fortran Overhead ( 0 ) : 26.1162s - [COUNTERS] CudaCpp MEs ( 2 ) : 288.2484s for 90112 events => throughput is 3.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 312.7787s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8939s + [COUNTERS] CudaCpp MEs ( 2 ) : 286.8848s for 90112 events => throughput is 3.14E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810528e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761983e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.825565e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775859e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 42.6054s - [COUNTERS] Fortran Overhead ( 0 ) : 19.4149s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.1905s for 8192 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.1739s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9383s for 8192 events => throughput is 3.57E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 278.0352s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0285s - [COUNTERS] CudaCpp MEs ( 2 ) : 255.0067s for 90112 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 277.3137s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0478s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.2659s for 90112 events => throughput is 3.54E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.372569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.346725e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.390556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.360141e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.8365s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9299s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.9066s for 8192 events => throughput is 3.58E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9554s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.4206s for 8192 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 280.1799s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4637s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.7162s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 283.3743s + [COUNTERS] Fortran Overhead ( 0 ) : 25.7277s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.6465s for 90112 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.829822e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.787133e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.840554e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796022e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5385s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6761s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8624s for 8192 events => throughput is 9.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5891s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8672s for 8192 events => throughput is 9.45E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.7972s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3222s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4751s for 90112 events => throughput is 9.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8181s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4843s for 90112 events => throughput is 9.50E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.416746e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.489325e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.082101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.086868e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111361e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112402e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.159067e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.163573e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107992e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112546e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110248e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110187e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.116277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113455e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.631653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651684e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index a53e3fae12..17d6db3749 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'make[1]: Nothing to be done for 'all'. - +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:24:05 +DATE: 2023-11-09_18:42:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3033s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s - [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2361s + [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2323s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2994s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2293s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1700s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4093s - 
[COUNTERS] Fortran MEs ( 1 ) : 0.7607s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4125s + [COUNTERS] Fortran MEs ( 1 ) : 0.7635s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0760s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3194s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4974s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8220s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3472s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8289s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089572e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.102064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081996e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3132s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2761s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4682s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4441s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9307s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4537s for 
90112 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.028339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997353e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.046734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027039e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2558s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6989s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4448s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2540s for 90112 events => throughput is 3.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4610s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2579s for 90112 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495576e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523608e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465419e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.89E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6795s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4499s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2296s for 90112 events => throughput is 3.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6775s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2293s for 90112 events => throughput is 3.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.842884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760921e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.986906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978083e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2949s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0311s for 8192 events => throughput is 2.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3444s for 90112 events => throughput is 2.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4686s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3413s for 90112 events => throughput is 2.64E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637628e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.568787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.616200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561174e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,8 +547,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8543s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8466s + [COUNTERS] PROGRAM TOTAL : 1.8698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8622s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.567103e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555687e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.093360e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.006338e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.536245e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.515172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.495821e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.526258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.517486e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533570e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.749421e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.783496e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532375e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.773747e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.774257e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 8d2e1984e4..a15824491a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:24:34 +DATE: 2023-11-09_18:42:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3107s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3036s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2276s - [COUNTERS] Fortran MEs ( 1 ) : 0.0689s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2303s + [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1583s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4006s - [COUNTERS] Fortran MEs ( 1 ) : 0.7577s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4340s + [COUNTERS] Fortran MEs ( 1 
) : 0.7729s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0730s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.5714s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2764s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4961s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7803s for 90112 events => throughput is 1.15E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.157942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160144e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.172513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172915e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7271s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7355s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2790s for 90112 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN 
xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194415e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.272249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.097783e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2459s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2455s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5730s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4380s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5855s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1369s for 90112 events => throughput is 6.58E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.530818e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.397086e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.313362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385448e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5576s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5778s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4506s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1272s for 90112 events => throughput is 7.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.360354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.864944e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.523552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826763e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.17E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6231s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4511s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1720s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.733153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.932364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.992885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764394e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6542s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.57E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8561s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9395s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9332s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.584146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.830948e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.491850e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.471030e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.856033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.130497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.715106e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.724199e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.884678e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.113825e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799322e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756435e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.441795e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.594258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.896004e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.959495e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 19ad35f402..3468beddc5 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:25:01 +DATE: 2023-11-09_18:42:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3005s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s - [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2346s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2306s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2359s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1678s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4091s - [COUNTERS] Fortran MEs ( 1 ) : 0.7587s for 90112 events => throughput is 1.19E+05 events/s 
+ [COUNTERS] PROGRAM TOTAL : 2.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4409s + [COUNTERS] Fortran MEs ( 1 ) : 0.7766s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3333s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8245s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5224s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8322s for 90112 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.076052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083780e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.093800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087409e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3088s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8975s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4298s for 90112 events => throughput is 2.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9817s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4479s for 
90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.015345e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021169e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.988560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.048865e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2819s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2833s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7744s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2658s for 90112 events => throughput is 3.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2564s for 90112 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495609e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.534003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519650e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2862s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0205s for 8192 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6779s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2245s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6910s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4668s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2243s for 90112 events => throughput is 4.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.974625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.857183e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.057698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.991341e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2985s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3063s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8085s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4572s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3513s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8379s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.330681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546786e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503592e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6565s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8718s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8641s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8739s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.553454e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.582711e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.988956e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.041620e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534455e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.514727e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.524256e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523754e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.513154e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.797491e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.530148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528865e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776434e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.779970e+07 ) sec^-1 TEST COMPLETED
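
For reference, the throughput figures quoted in the [COUNTERS] lines of the logs above are simply the event count divided by the time reported on the same line (e.g. 8192 events / 0.0700s => 1.17E+05 events/s). A minimal sketch that re-derives and cross-checks them from a saved log file; the script name check_counters.py and its tolerance choice are illustrative assumptions, not part of this patch:

#!/usr/bin/env python3
# check_counters.py (illustrative name, not part of this patch):
# recompute the throughputs quoted in the [COUNTERS] lines of the tmad logs,
# e.g. "[COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events
#       => throughput is 1.17E+05 events/s",
# as events/seconds, and cross-check them against the printed figure.
import re
import sys

COUNTER = re.compile(
    r"\[COUNTERS\]\s+(?P<label>.+?)\s+\(\s*\d+\s*\)\s*:\s*"
    r"(?P<secs>[0-9.]+)s for (?P<nevt>\d+) events => "
    r"throughput is (?P<tput>[0-9.]+E[+-]\d+) events/s")

def check(path):
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = COUNTER.search(line)  # also matches lines with a +/- diff prefix
            if m is None:
                continue  # e.g. "PROGRAM TOTAL" lines carry no event count
            secs = float(m["secs"])
            nevt = int(m["nevt"])
            printed = float(m["tput"])
            computed = nevt / secs
            # The log rounds seconds to 4 decimals and the throughput to 3
            # significant figures, so the tolerance must grow as secs shrinks
            # (half an ULP of the rounded seconds, plus 0.5% for the 3 sig figs).
            tol = 0.005 + 5e-5 / secs
            status = "OK" if abs(computed - printed) / printed < tol else "MISMATCH"
            print(f"{m['label'].strip():<16} {computed:.3e} vs {printed:.3e} {status}")

if __name__ == "__main__":
    for path in sys.argv[1:]:
        check(path)

Running it over e.g. tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt should print OK for every counter line on both sides of the diff; the tolerance is deliberately widened for sub-millisecond timings (such as the 0.0005s CUDA ME lines), where the 4-decimal rounding of the seconds dominates the discrepancy.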