From 40642132f3ef0d9b946c279b371671d8b59fff14 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 19:30:49 +0100 Subject: [PATCH 01/14] [gpucpp] include Olivier's upstream "fix issue for large color matrix where a index issue was not spotted" for #781. I will regenerate and run tests. --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 49c93e01b8..d7a466dd54 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 +Subproject commit d7a466dd54bb2f57564f5cc674f129ebf095c969 From 8c654cf0d35c332e3f4449301f8a8758cc3efce5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 19:43:13 +0100 Subject: [PATCH 02/14] [gpucpp] in CODEGEN output.py, add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card". ./MG5_debug:AttributeError: 'PLUGIN_ProcessExporter' object has no attribute 'run_card_class' --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 8961036fb1..e3f88719f2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,6 +149,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! + # AV 08 Nov 2023 add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card" + run_card_class = None + # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 From e20455c5070f97bc41bf4767ce97242d06ed0b21 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 20:46:46 +0100 Subject: [PATCH 03/14] [gpucpp] regenerate all 15 processes after Olivier's latest upstream changes, which should fix #781. Apart from codegen logs, there are changes in banner.py (sketched below), as well as one change in matrix1.f for ggttggg.
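For reference, the banner.py change carried by this patch touches RunCard.write: the old code built a $-placeholder mapping for each block and called string.Template(text).substitute(mapping) directly, so any block whose "$name" placeholder was absent from the template text was silently dropped (substitute ignores unused mapping keys). The new code first appends a placeholder for each missing block. A minimal, self-contained sketch of that logic, with invented sample text and block names rather than the actual run_card template:

    import string

    # 'text' stands for the run_card template read from file; 'mapping' for the
    # per-block content that banner.py builds via b.get_template(self).
    text = "# run_card header\n$cut_block\n"
    mapping = {"cut_block": "...cut settings...", "pdf_block": "...pdf settings..."}

    # New logic: append a placeholder for any block the template lacks, so its
    # content is no longer silently dropped by the substitution below.
    for name in mapping:
        if "$%s" % name not in text:
            text += "\n$%s\n" % name
    text = string.Template(text).substitute(mapping)
    print(text)  # both cut_block and pdf_block content now appear

Without the appended placeholders, substitute() would simply leave pdf_block out of the written card: it only raises for placeholders missing from the mapping, not for mapping keys missing from the text.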
This may indeed be the fix for #781 --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 36 +-- .../ee_mumu.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 26 +-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 36 +-- .../cudacpp/gg_tt.mad/bin/internal/banner.py | 8 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 24 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 40 ++-- .../gg_tt01g.mad/bin/internal/banner.py | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 36 +-- .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 26 +-- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 34 +-- .../gg_ttgg.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 30 +-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 38 ++-- .../SubProcesses/P1_gg_ttxggg/matrix1.f | 164 +++++++------- .../gg_ttggg.mad/bin/internal/banner.py | 8 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 +-- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 40 ++-- .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 36 +-- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 20 +- .../CODEGEN_mad_pp_tt012j_log.txt | 208 +++++++++--------- .../pp_tt012j.mad/bin/internal/banner.py | 8 +- 24 files changed, 452 insertions(+), 436 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 16a5e3cdc9..d5d0a77b77 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005366802215576172  +DEBUG: model prefixing takes 0.005647420883178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,17 +154,17 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. 
We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 3 routines in 0.203 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.258 s +ALOHA: aloha creates 7 routines in 0.267 s FFV1 FFV1 FFV2 @@ -226,7 +226,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -241,16 +241,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. 
quit -real 0m1.973s -user 0m1.681s -sys 0m0.231s +real 0m2.189s +user 0m1.653s +sys 0m0.232s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index d48a5c4d44..ccb39ba2cc 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533456802368164  +DEBUG: model prefixing takes 0.005671501159667969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -160,28 +160,28 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.272 s FFV1 FFV1 FFV2 @@ -198,9 +198,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.669s -user 0m0.609s -sys 0m0.053s +real 0m0.795s +user 0m0.698s +sys 0m0.066s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 2460cf072a..b0eb76c9f4 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415439605712891  +DEBUG: model prefixing takes 0.005650758743286133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.107 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.152 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.134 s +ALOHA: aloha creates 4 routines in 0.140 s VVV1 FFV1 FFV1 @@ -219,7 +219,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -230,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.735s -user 0m1.507s -sys 0m0.213s +real 0m1.780s +user 0m1.544s +sys 0m0.218s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index f9425b6b07..27709b8f4f 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057506561279296875  +DEBUG: model prefixing takes 0.005872249603271484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,22 +161,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter 
for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines @@ -193,9 +193,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.684s -user 0m0.481s -sys 0m0.057s +real 0m0.565s +user 0m0.498s +sys 0m0.040s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 2db08eff10..0eefbc9b91 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005487680435180664  +DEBUG: model prefixing takes 0.005677461624145508  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,10 +170,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,14 +211,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Wrote files for 46 helas calls in 0.249 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -226,7 +226,7 @@ ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 ALOHA: aloha creates 5 routines in 0.331 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -257,7 +257,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -276,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. 
quit -real 0m2.331s -user 0m2.084s -sys 0m0.240s +real 0m2.345s +user 0m2.078s +sys 0m0.243s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 5643c4439c..740186af78 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005489349365234375  +DEBUG: model prefixing takes 0.005747556686401367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.151 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s +Wrote files for 36 helas calls in 0.152 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.332 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.326 s VVV1 VVV1 FFV1 @@ -230,7 +230,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -245,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.275s -user 0m1.969s -sys 0m0.229s +real 0m2.221s +user 0m1.964s +sys 0m0.245s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 6c3bb7fa30..f795e1428d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00565791130065918  +DEBUG: model prefixing takes 0.0055065155029296875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.328 s VVV1 VVV1 FFV1 @@ -201,9 +201,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.836s -user 0m0.731s -sys 0m0.060s +real 0m0.870s +user 0m0.728s +sys 0m0.055s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2401636ea2..374e4defbb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005415916442871094  +DEBUG: model prefixing takes 0.005505084991455078  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,22 +184,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.446 s -Wrote files for 222 helas calls in 0.728 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s +Wrote files for 222 helas calls in 0.704 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.335 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -233,7 +233,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -248,16 +248,16 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). 
-DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.354s -user 0m3.128s -sys 0m0.221s +real 0m3.310s +user 0m3.061s +sys 0m0.239s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index d29fe4c726..b1a7fdc7e4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005518913269042969  +DEBUG: model prefixing takes 0.005366086959838867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.161 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.430 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.324 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m1.541s -user 0m1.392s -sys 0m0.062s +real 0m1.466s +user 0m1.388s +sys 0m0.064s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index cd9806264d..af1d671efc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005460023880004883  +DEBUG: model prefixing takes 0.005596160888671875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.902 s +1 processes with 1240 diagrams generated in 1.921 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 
0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.662 s -Wrote files for 2281 helas calls in 18.810 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.769 s +Wrote files for 2281 helas calls in 18.847 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 
routines in 0.319 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.320 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.317 s VVV1 VVV1 FFV1 @@ -235,7 +235,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -250,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m29.634s -user 0m29.131s -sys 0m0.396s +real 0m29.796s +user 0m29.282s +sys 0m0.413s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index b8a6a894de..ac5285eda5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -17540,7 +17540,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) TMP_JAMP(2914) = TMP_JAMP(2351) + TMP_JAMP(1665) ! used 2 times TMP_JAMP(2913) = TMP_JAMP(2310) + TMP_JAMP(2134) ! used 2 times TMP_JAMP(2912) = TMP_JAMP(2073) + ((-0.000000000000000D+00 - $ ,1.000000000000000D+00)) * AMP(1483) ! used 2 times + $ ,1.000000000000000D+00)) * AMP(1481) ! used 2 times TMP_JAMP(3030) = TMP_JAMP(2935) + ((0.000000000000000D+00, $ -1.000000000000000D+00)) * TMP_JAMP(1044) ! used 2 times TMP_JAMP(3029) = TMP_JAMP(2934) - TMP_JAMP(329) ! 
used 2 times @@ -17688,7 +17688,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(360)+TMP_JAMP(485)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(558)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(576)+((0.000000000000000D+00 - $ ,1.000000000000000D+00))*AMP(1489)+(-1.000000000000000D+00) + $ ,1.000000000000000D+00))*AMP(1485)+(-1.000000000000000D+00) $ *TMP_JAMP(2911)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2971)+TMP_JAMP(2994) JAMP(2,1) = (-1.000000000000000D+00)*AMP(242)+( @@ -17698,7 +17698,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(557)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(576)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1580)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1480)+TMP_JAMP(2655)+(-1.000000000000000D+00) + $ *AMP(1476)+TMP_JAMP(2655)+(-1.000000000000000D+00) $ *TMP_JAMP(2913)+(-1.000000000000000D+00)*TMP_JAMP(2940) JAMP(3,1) = (-1.000000000000000D+00)*AMP(250)+( $ -1.000000000000000D+00)*TMP_JAMP(484)+((0.000000000000000D+00 @@ -17715,7 +17715,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(575)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1589)+TMP_JAMP(1693) $ +TMP_JAMP(2050)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *AMP(1471)+(-1.000000000000000D+00)*TMP_JAMP(2353) + $ *AMP(1467)+(-1.000000000000000D+00)*TMP_JAMP(2353) $ +TMP_JAMP(2659)+TMP_JAMP(2905)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2955)+TMP_JAMP(2960) JAMP(5,1) = (-1.000000000000000D+00)*AMP(241) @@ -17919,7 +17919,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(25,1) = (-1.000000000000000D+00)*TMP_JAMP(360) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(454) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(517) - $ +(-1.000000000000000D+00)*AMP(976)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(974)+(-1.000000000000000D+00) $ *TMP_JAMP(1843)+TMP_JAMP(1859)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2085)+TMP_JAMP(2104)+( $ -1.000000000000000D+00)*TMP_JAMP(2662)+TMP_JAMP(2851) @@ -17929,7 +17929,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(518)+(-1.000000000000000D+00) $ *TMP_JAMP(834)+(-1.000000000000000D+00)*TMP_JAMP(1019) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1374) - $ +(-1.000000000000000D+00)*AMP(967)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(965)+(-1.000000000000000D+00) $ *TMP_JAMP(1479)+TMP_JAMP(1842)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2085)+(-1.000000000000000D+00) $ *TMP_JAMP(2129)+(-1.000000000000000D+00)*TMP_JAMP(2648) @@ -17940,7 +17940,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(513)+(-1.000000000000000D+00)*TMP_JAMP(809)+( $ -1.000000000000000D+00)*TMP_JAMP(1028)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1373)+(-1.000000000000000D+00) - $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) + $ *AMP(973)+(-1.000000000000000D+00)*TMP_JAMP(1963)+TMP_JAMP(2060) $ +(-1.000000000000000D+00)*TMP_JAMP(2104)+TMP_JAMP(2317) $ +TMP_JAMP(2387)+TMP_JAMP(2567)+(-1.000000000000000D+00) $ *TMP_JAMP(2604)+TMP_JAMP(2796)+TMP_JAMP(2811)+( @@ -17950,7 +17950,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(470)+((0.000000000000000D+00,1.000000000000000D+00)) $ 
*TMP_JAMP(514)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(735)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(958)+TMP_JAMP(1448) + $ *TMP_JAMP(1392)+(-1.000000000000000D+00)*AMP(956)+TMP_JAMP(1448) $ +(-1.000000000000000D+00)*TMP_JAMP(1839)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1846)+(-1.000000000000000D $ +00)*TMP_JAMP(1919)+TMP_JAMP(1963)+(-1.000000000000000D+00) @@ -17960,13 +17960,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(29,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(314)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(462)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1709) + $ *TMP_JAMP(717)+(-1.000000000000000D+00)*AMP(964)+TMP_JAMP(1709) $ +(-1.000000000000000D+00)*TMP_JAMP(1874)+TMP_JAMP(2061) - $ +TMP_JAMP(2129)+AMP(1642)+TMP_JAMP(2445)+(-1.000000000000000D + $ +TMP_JAMP(2129)+AMP(1638)+TMP_JAMP(2445)+(-1.000000000000000D $ +00)*TMP_JAMP(2493)+TMP_JAMP(2647)+TMP_JAMP(2985)+TMP_JAMP(2996) JAMP(30,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(320)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(957)+( + $ *TMP_JAMP(520)+(-1.000000000000000D+00)*AMP(955)+( $ -1.000000000000000D+00)*TMP_JAMP(1840)+TMP_JAMP(1874) $ +TMP_JAMP(1919)+TMP_JAMP(1966)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2064)+TMP_JAMP(2250)+( @@ -17974,7 +17974,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3000)+TMP_JAMP(3007) JAMP(31,1) = TMP_JAMP(804)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(979)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) + $ *AMP(977)+TMP_JAMP(1857)+TMP_JAMP(1894)+TMP_JAMP(2130) $ +TMP_JAMP(2609)+(-1.000000000000000D+00)*TMP_JAMP(2816) $ +TMP_JAMP(2825)+(-1.000000000000000D+00)*TMP_JAMP(2863)+( $ -1.000000000000000D+00)*TMP_JAMP(3018) @@ -17982,7 +17982,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(949)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1147)+TMP_JAMP(1280)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1374)+(-1.000000000000000D+00) - $ *AMP(970)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(968)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2067)+(-1.000000000000000D+00)*TMP_JAMP(2130) $ +TMP_JAMP(2333)+(-1.000000000000000D+00)*TMP_JAMP(2542) $ +TMP_JAMP(2713)+(-1.000000000000000D+00)*TMP_JAMP(2763) @@ -17991,7 +17991,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(33,1) = (-1.000000000000000D+00)*TMP_JAMP(1102)+( $ -1.000000000000000D+00)*TMP_JAMP(1256)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1391)+(-1.000000000000000D+00) - $ *AMP(977)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( + $ *AMP(975)+(-1.000000000000000D+00)*TMP_JAMP(1688)+( $ -1.000000000000000D+00)*TMP_JAMP(2556)+TMP_JAMP(2811) $ +TMP_JAMP(2817)+TMP_JAMP(2882)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2976)+(-1.000000000000000D+00) @@ -18009,7 +18009,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1033)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1152)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1155)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(968)+TMP_JAMP(1582) + $ 
*TMP_JAMP(1224)+(-1.000000000000000D+00)*AMP(966)+TMP_JAMP(1582) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2006) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2105) $ +TMP_JAMP(2514)+TMP_JAMP(2546)+TMP_JAMP(2695)+( @@ -18029,7 +18029,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(910)+TMP_JAMP(1277) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1346) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1373) - $ +(-1.000000000000000D+00)*AMP(980)+TMP_JAMP(1883) + $ +(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1883) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2066) $ +TMP_JAMP(2128)+TMP_JAMP(2609)+(-1.000000000000000D+00) $ *TMP_JAMP(2846)+(-1.000000000000000D+00)*TMP_JAMP(2899)+( @@ -18040,7 +18040,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00))*TMP_JAMP(1143)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1148)+((0.000000000000000D $ +00,-1.000000000000000D+00))*TMP_JAMP(1392)+( - $ -1.000000000000000D+00)*AMP(961)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(959)+(-1.000000000000000D+00) $ *TMP_JAMP(2128)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2138)+TMP_JAMP(2296)+(-1.000000000000000D+00) $ *TMP_JAMP(2483)+(-1.000000000000000D+00)*TMP_JAMP(2535)+( @@ -18050,7 +18050,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(1020)+(-1.000000000000000D+00) $ *TMP_JAMP(1039)+TMP_JAMP(1100)+(-1.000000000000000D+00) $ *TMP_JAMP(1255)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(978)+TMP_JAMP(1686) + $ *TMP_JAMP(1346)+(-1.000000000000000D+00)*AMP(976)+TMP_JAMP(1686) $ +(-1.000000000000000D+00)*TMP_JAMP(1799)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(1988)+(-1.000000000000000D $ +00)*TMP_JAMP(2497)+TMP_JAMP(2591)+(-1.000000000000000D+00) @@ -18072,7 +18072,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1159)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1211)+(-1.000000000000000D $ +00)*TMP_JAMP(1270)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(959) + $ +00))*TMP_JAMP(1311)+(-1.000000000000000D+00)*AMP(957) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1784) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1868) $ +(-1.000000000000000D+00)*TMP_JAMP(1939)+((0.000000000000000D @@ -18094,11 +18094,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(43,1) = TMP_JAMP(678)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(688)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(949)+TMP_JAMP(1387)+( - $ -1.000000000000000D+00)*AMP(971)+TMP_JAMP(2125)+TMP_JAMP(2127) + $ -1.000000000000000D+00)*AMP(969)+TMP_JAMP(2125)+TMP_JAMP(2127) $ +(-1.000000000000000D+00)*TMP_JAMP(2481)+TMP_JAMP(2497)+( $ -1.000000000000000D+00)*TMP_JAMP(2722)+(-1.000000000000000D+00) $ *TMP_JAMP(2897)+(-1.000000000000000D+00)*TMP_JAMP(2996) - JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(962)+( + JAMP(44,1) = TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960)+( $ -1.000000000000000D+00)*TMP_JAMP(2126)+(-1.000000000000000D+00) $ *TMP_JAMP(2127)+(-1.000000000000000D+00)*TMP_JAMP(2535) $ +TMP_JAMP(2556)+(-1.000000000000000D+00)*TMP_JAMP(2730)+( @@ -18107,7 +18107,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, 
IVEC) JAMP(45,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(728)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(874)+TMP_JAMP(1382)+(-1.000000000000000D+00) - $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(969)+TMP_JAMP(1824) + $ *TMP_JAMP(1387)+(-1.000000000000000D+00)*AMP(967)+TMP_JAMP(1824) $ +(-1.000000000000000D+00)*TMP_JAMP(2088)+((0.000000000000000D $ +00,1.000000000000000D+00))*TMP_JAMP(2105)+(-1.000000000000000D $ +00)*TMP_JAMP(2327)+(-1.000000000000000D+00)*TMP_JAMP(2608) @@ -18127,7 +18127,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(47,1) = TMP_JAMP(1129)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1158)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1303)+(-1.000000000000000D - $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(960) + $ +00)*TMP_JAMP(1384)+(-1.000000000000000D+00)*AMP(958) $ +TMP_JAMP(1563)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2086)+(-1.000000000000000D+00)*TMP_JAMP(2089)+( $ -1.000000000000000D+00)*TMP_JAMP(2364)+TMP_JAMP(2466)+( @@ -18146,21 +18146,21 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1900)+TMP_JAMP(1972)+TMP_JAMP(2677)+( $ -1.000000000000000D+00)*TMP_JAMP(2897)+TMP_JAMP(2954) JAMP(49,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1405) + $ *TMP_JAMP(1393)+(-1.000000000000000D+00)*AMP(1403) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1746) $ +TMP_JAMP(1892)+(-1.000000000000000D+00)*TMP_JAMP(1939) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2136) $ +TMP_JAMP(2579)+TMP_JAMP(2630)+(-1.000000000000000D+00) $ *TMP_JAMP(2836)+TMP_JAMP(2837)+TMP_JAMP(2860)+TMP_JAMP(2990) JAMP(50,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1399)+( + $ *TMP_JAMP(1405)+(-1.000000000000000D+00)*AMP(1397)+( $ -1.000000000000000D+00)*TMP_JAMP(1892)+TMP_JAMP(1938) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1977) $ +TMP_JAMP(2026)+(-1.000000000000000D+00)*TMP_JAMP(2620) $ +TMP_JAMP(2731)+TMP_JAMP(2783)+TMP_JAMP(2938)+TMP_JAMP(2986) JAMP(51,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1394)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1404) + $ *TMP_JAMP(1397)+(-1.000000000000000D+00)*AMP(1402) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1737) $ +TMP_JAMP(1891)+TMP_JAMP(1937)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2136)+TMP_JAMP(2575) @@ -18168,11 +18168,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2895) JAMP(52,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+TMP_JAMP(1385)+(-1.000000000000000D+00) - $ *AMP(1020)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( + $ *AMP(1018)+(-1.000000000000000D+00)*TMP_JAMP(1619)+( $ -1.000000000000000D+00)*TMP_JAMP(1891)+TMP_JAMP(2145)+( $ -1.000000000000000D+00)*TMP_JAMP(2531)+(-1.000000000000000D+00) $ *TMP_JAMP(2853)+TMP_JAMP(2938)+TMP_JAMP(2988)+TMP_JAMP(3009) - JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1398) + JAMP(53,1) = TMP_JAMP(1415)+(-1.000000000000000D+00)*AMP(1396) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1744) $ +(-1.000000000000000D+00)*TMP_JAMP(1811)+TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1977) @@ -18184,7 +18184,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
*TMP_JAMP(721)+(-1.000000000000000D+00)*TMP_JAMP(1263) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1295) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1375) - $ +(-1.000000000000000D+00)*AMP(1019)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1017)+(-1.000000000000000D+00) $ *TMP_JAMP(1655)+(-1.000000000000000D+00)*TMP_JAMP(1890) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1986) $ +(-1.000000000000000D+00)*TMP_JAMP(2145)+TMP_JAMP(2492) @@ -18194,7 +18194,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(55,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1063)+TMP_JAMP(1141)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1177)+(-1.000000000000000D+00) - $ *AMP(1408)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( + $ *AMP(1406)+(-1.000000000000000D+00)*TMP_JAMP(1894)+( $ -1.000000000000000D+00)*TMP_JAMP(2075)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2578)+TMP_JAMP(2821)+(-1.000000000000000D+00) @@ -18203,7 +18203,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(56,1) = TMP_JAMP(647)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1168)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1205)+(-1.000000000000000D+00) - $ *AMP(1402)+TMP_JAMP(2047)+((0.000000000000000D+00, + $ *AMP(1400)+TMP_JAMP(2047)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2108)+(-1.000000000000000D+00) $ *TMP_JAMP(2452)+TMP_JAMP(2814)+(-1.000000000000000D+00) $ *TMP_JAMP(2940)+(-1.000000000000000D+00)*TMP_JAMP(2957)+( @@ -18213,7 +18213,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1172)+TMP_JAMP(1257)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1301)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1340)+(-1.000000000000000D+00) - $ *AMP(1406)+TMP_JAMP(1677)+((0.000000000000000D+00 + $ *AMP(1404)+TMP_JAMP(1677)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2142)+(-1.000000000000000D+00) $ *TMP_JAMP(2820)+TMP_JAMP(2832)+(-1.000000000000000D+00) $ *TMP_JAMP(2909)+((0.000000000000000D+00,-1.000000000000000D+00)) @@ -18233,7 +18233,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(893) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1169) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1209) - $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1400) + $ +TMP_JAMP(1377)+(-1.000000000000000D+00)*AMP(1398) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1776) $ +(-1.000000000000000D+00)*TMP_JAMP(2149)+TMP_JAMP(2729)+( $ -1.000000000000000D+00)*TMP_JAMP(2819)+(-1.000000000000000D+00) @@ -18251,7 +18251,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2879)+(-1.000000000000000D+00) $ *TMP_JAMP(2983) JAMP(61,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1409) + $ *TMP_JAMP(1394)+(-1.000000000000000D+00)*AMP(1407) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2106) $ +(-1.000000000000000D+00)*TMP_JAMP(2319)+(-1.000000000000000D $ +00)*TMP_JAMP(2805)+(-1.000000000000000D+00)*TMP_JAMP(2881) @@ -18261,14 +18261,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1231)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1288)+((0.000000000000000D $ 
+00,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D - $ +00)*AMP(1022)+((0.000000000000000D+00,1.000000000000000D+00)) + $ +00)*AMP(1020)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2106)+(-1.000000000000000D+00)*TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2271)+TMP_JAMP(2363) $ +TMP_JAMP(2437)+TMP_JAMP(2562)+(-1.000000000000000D+00) $ *TMP_JAMP(2745)+(-1.000000000000000D+00)*TMP_JAMP(2988)+( $ -1.000000000000000D+00)*TMP_JAMP(3022) JAMP(63,1) = (-1.000000000000000D+00)*TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(1407)+TMP_JAMP(1952) + $ -1.000000000000000D+00)*AMP(1405)+TMP_JAMP(1952) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2142) $ +(-1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2687)+(-1.000000000000000D+00) @@ -18278,7 +18278,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(311) $ +(-1.000000000000000D+00)*TMP_JAMP(421)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(501)+TMP_JAMP(1380)+( - $ -1.000000000000000D+00)*AMP(947)+((0.000000000000000D+00, + $ -1.000000000000000D+00)*AMP(945)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1544)+TMP_JAMP(1683) $ +TMP_JAMP(1801)+(-1.000000000000000D+00)*TMP_JAMP(2450) $ +TMP_JAMP(2586)+TMP_JAMP(2720)+TMP_JAMP(2869) @@ -18287,7 +18287,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(65,1) = TMP_JAMP(579)+(-1.000000000000000D+00) $ *TMP_JAMP(1008)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1049)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1017) + $ *TMP_JAMP(1218)+(-1.000000000000000D+00)*AMP(1015) $ +TMP_JAMP(1611)+TMP_JAMP(1862)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1901)+TMP_JAMP(2273)+( $ -1.000000000000000D+00)*TMP_JAMP(2441)+TMP_JAMP(3022) @@ -18304,7 +18304,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2584)+TMP_JAMP(2887)+( $ -1.000000000000000D+00)*TMP_JAMP(2914)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2975) - JAMP(67,1) = (-1.000000000000000D+00)*AMP(1403)+( + JAMP(67,1) = (-1.000000000000000D+00)*AMP(1401)+( $ -1.000000000000000D+00)*TMP_JAMP(1626)+(-1.000000000000000D+00) $ *TMP_JAMP(2144)+(-1.000000000000000D+00)*TMP_JAMP(2452)+( $ -1.000000000000000D+00)*TMP_JAMP(2678)+TMP_JAMP(2768) @@ -18314,13 +18314,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1055)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1058)+TMP_JAMP(1275)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1342)+(-1.000000000000000D+00) - $ *AMP(1021)+((0.000000000000000D+00,-1.000000000000000D+00)) + $ *AMP(1019)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2116)+TMP_JAMP(2144)+TMP_JAMP(2297)+( $ -1.000000000000000D+00)*TMP_JAMP(2341)+TMP_JAMP(2426)+( $ -1.000000000000000D+00)*TMP_JAMP(2486)+TMP_JAMP(2794)+( $ -1.000000000000000D+00)*TMP_JAMP(2999)+TMP_JAMP(3016) JAMP(69,1) = (-1.000000000000000D+00)*TMP_JAMP(1413)+( - $ -1.000000000000000D+00)*AMP(1401)+TMP_JAMP(2042)+TMP_JAMP(2149) + $ -1.000000000000000D+00)*AMP(1399)+TMP_JAMP(2042)+TMP_JAMP(2149) $ +TMP_JAMP(2578)+TMP_JAMP(2679)+TMP_JAMP(2731)+( $ -1.000000000000000D+00)*TMP_JAMP(2800)+(-1.000000000000000D+00) $ *TMP_JAMP(2883)+TMP_JAMP(3004) @@ -18337,7 +18337,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
-1.000000000000000D+00)*TMP_JAMP(2961) JAMP(71,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1176)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1018) + $ *TMP_JAMP(1296)+(-1.000000000000000D+00)*AMP(1016) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2091) $ +TMP_JAMP(2343)+(-1.000000000000000D+00)*TMP_JAMP(2800)+( $ -1.000000000000000D+00)*TMP_JAMP(2945)+(-1.000000000000000D+00) @@ -18359,11 +18359,11 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1761)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1764)+TMP_JAMP(1895)+( $ -1.000000000000000D+00)*TMP_JAMP(1932)+(-1.000000000000000D+00) - $ *AMP(1428)+TMP_JAMP(2569)+(-1.000000000000000D+00) + $ *AMP(1424)+TMP_JAMP(2569)+(-1.000000000000000D+00) $ *TMP_JAMP(2652)+TMP_JAMP(2683)+TMP_JAMP(2786)+TMP_JAMP(2796) $ +TMP_JAMP(2902) JAMP(74,1) = TMP_JAMP(2027)+TMP_JAMP(2042)+(-1.000000000000000D - $ +00)*AMP(1422)+TMP_JAMP(2383)+TMP_JAMP(2580)+( + $ +00)*AMP(1418)+TMP_JAMP(2383)+TMP_JAMP(2580)+( $ -1.000000000000000D+00)*TMP_JAMP(2683)+TMP_JAMP(2735)+( $ -1.000000000000000D+00)*TMP_JAMP(2798)+(-1.000000000000000D+00) $ *TMP_JAMP(2932)+TMP_JAMP(2942)+TMP_JAMP(3008) @@ -18372,14 +18372,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1383)+(-1.000000000000000D+00)*TMP_JAMP(1386) $ +TMP_JAMP(1860)+(-1.000000000000000D+00)*TMP_JAMP(1863)+( $ -1.000000000000000D+00)*TMP_JAMP(1895)+TMP_JAMP(1899)+( - $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2627)+TMP_JAMP(2780) + $ -1.000000000000000D+00)*AMP(1423)+TMP_JAMP(2627)+TMP_JAMP(2780) $ +(-1.000000000000000D+00)*TMP_JAMP(2895)+(-1.000000000000000D $ +00)*TMP_JAMP(2936) JAMP(76,1) = (-1.000000000000000D+00)*TMP_JAMP(1038)+( $ -1.000000000000000D+00)*TMP_JAMP(1107)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1185)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1203)+(-1.000000000000000D - $ +00)*AMP(1029)+(-1.000000000000000D+00)*TMP_JAMP(1899) + $ +00)*AMP(1027)+(-1.000000000000000D+00)*TMP_JAMP(1899) $ +TMP_JAMP(2043)+(-1.000000000000000D+00)*TMP_JAMP(2095)+( $ -1.000000000000000D+00)*TMP_JAMP(2328)+TMP_JAMP(2458)+( $ -1.000000000000000D+00)*TMP_JAMP(2611)+TMP_JAMP(2649)+( @@ -18388,13 +18388,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3009) JAMP(77,1) = (-1.000000000000000D+00)*TMP_JAMP(800) $ +TMP_JAMP(1631)+(-1.000000000000000D+00)*TMP_JAMP(1812) - $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1421)+( + $ +TMP_JAMP(1898)+(-1.000000000000000D+00)*AMP(1417)+( $ -1.000000000000000D+00)*TMP_JAMP(2332)+TMP_JAMP(2537) $ +TMP_JAMP(2932)+(-1.000000000000000D+00)*TMP_JAMP(2936)+( $ -1.000000000000000D+00)*TMP_JAMP(2972)+TMP_JAMP(3023) JAMP(78,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1216)+(-1.000000000000000D+00)*TMP_JAMP(1264)+( - $ -1.000000000000000D+00)*AMP(1028)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1026)+(-1.000000000000000D+00) $ *TMP_JAMP(1494)+(-1.000000000000000D+00)*TMP_JAMP(1633) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1764) $ +(-1.000000000000000D+00)*TMP_JAMP(1898)+TMP_JAMP(2095)+( @@ -18408,7 +18408,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1200)+TMP_JAMP(1626)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1849)+(-1.000000000000000D+00) $ *TMP_JAMP(1883)+(-1.000000000000000D+00)*TMP_JAMP(2036)+( - $ 
-1.000000000000000D+00)*AMP(1431)+TMP_JAMP(2489)+( + $ -1.000000000000000D+00)*AMP(1427)+TMP_JAMP(2489)+( $ -1.000000000000000D+00)*TMP_JAMP(2505)+(-1.000000000000000D+00) $ *TMP_JAMP(2570)+(-1.000000000000000D+00)*TMP_JAMP(2630) $ +TMP_JAMP(2645)+TMP_JAMP(2686)+(-1.000000000000000D+00) @@ -18417,7 +18417,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1207)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1291)+TMP_JAMP(2037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2138) - $ +(-1.000000000000000D+00)*AMP(1425)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1421)+(-1.000000000000000D+00) $ *TMP_JAMP(2250)+(-1.000000000000000D+00)*TMP_JAMP(2381)+( $ -1.000000000000000D+00)*TMP_JAMP(2686)+(-1.000000000000000D+00) $ *TMP_JAMP(2699)+TMP_JAMP(2905)+TMP_JAMP(2987)+( @@ -18428,7 +18428,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(1349)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1987)+TMP_JAMP(2020)+( $ -1.000000000000000D+00)*TMP_JAMP(2141)+(-1.000000000000000D+00) - $ *AMP(1429)+(-1.000000000000000D+00)*TMP_JAMP(2773) + $ *AMP(1425)+(-1.000000000000000D+00)*TMP_JAMP(2773) $ +TMP_JAMP(2864)+(-1.000000000000000D+00)*TMP_JAMP(2909) $ +TMP_JAMP(3011) JAMP(82,1) = (-1.000000000000000D+00)*AMP(404) @@ -18448,7 +18448,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1212)+TMP_JAMP(1268)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1868)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(2011)+(-1.000000000000000D - $ +00)*AMP(1423)+TMP_JAMP(2451)+TMP_JAMP(2699)+( + $ +00)*AMP(1419)+TMP_JAMP(2451)+TMP_JAMP(2699)+( $ -1.000000000000000D+00)*TMP_JAMP(2772)+TMP_JAMP(2917)+( $ -1.000000000000000D+00)*TMP_JAMP(2939)+(-1.000000000000000D+00) $ *TMP_JAMP(2965) @@ -18465,7 +18465,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2761)+(-1.000000000000000D+00) $ *TMP_JAMP(2880)+(-1.000000000000000D+00)*TMP_JAMP(2922) $ +TMP_JAMP(2965) - JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1432)+( + JAMP(85,1) = TMP_JAMP(1386)+(-1.000000000000000D+00)*AMP(1428)+( $ -1.000000000000000D+00)*TMP_JAMP(2372)+TMP_JAMP(2387) $ +TMP_JAMP(2393)+TMP_JAMP(2427)+(-1.000000000000000D+00) $ *TMP_JAMP(2467)+(-1.000000000000000D+00)*TMP_JAMP(2505)+( @@ -18478,14 +18478,14 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(983)+TMP_JAMP(1107)+TMP_JAMP(1127) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1204) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1290) - $ +(-1.000000000000000D+00)*AMP(1031)+TMP_JAMP(2146)+( + $ +(-1.000000000000000D+00)*AMP(1029)+TMP_JAMP(2146)+( $ -1.000000000000000D+00)*TMP_JAMP(2480)+TMP_JAMP(2499)+( $ -1.000000000000000D+00)*TMP_JAMP(2721)+(-1.000000000000000D+00) $ *TMP_JAMP(2896)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(2977)+(-1.000000000000000D+00)*TMP_JAMP(2995) JAMP(87,1) = (-1.000000000000000D+00)*TMP_JAMP(1379)+( $ -1.000000000000000D+00)*TMP_JAMP(1953)+TMP_JAMP(2141)+( - $ -1.000000000000000D+00)*AMP(1430)+TMP_JAMP(2247)+TMP_JAMP(2403) + $ -1.000000000000000D+00)*AMP(1426)+TMP_JAMP(2247)+TMP_JAMP(2403) $ +TMP_JAMP(2882)+TMP_JAMP(2902)+(-1.000000000000000D+00) $ *TMP_JAMP(2929)+TMP_JAMP(3005) JAMP(88,1) = (-1.000000000000000D+00)*AMP(405)+( @@ -18504,7 +18504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ 
,1.000000000000000D+00))*TMP_JAMP(476)+TMP_JAMP(1007) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1052) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1219) - $ +(-1.000000000000000D+00)*AMP(1026)+TMP_JAMP(1696)+( + $ +(-1.000000000000000D+00)*AMP(1024)+TMP_JAMP(1696)+( $ -1.000000000000000D+00)*TMP_JAMP(1722)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1858)+((0.000000000000000D+00 $ ,-1.000000000000000D+00))*TMP_JAMP(1901)+(-1.000000000000000D @@ -18526,22 +18526,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(3003) JAMP(91,1) = TMP_JAMP(647)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) - $ *AMP(1426)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D + $ *AMP(1422)+TMP_JAMP(2369)+TMP_JAMP(2502)+(-1.000000000000000D $ +00)*TMP_JAMP(2941)+(-1.000000000000000D+00)*TMP_JAMP(3023)+( $ -1.000000000000000D+00)*TMP_JAMP(3024) JAMP(92,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(985)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1204)+TMP_JAMP(1261)+TMP_JAMP(1280) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1030)+((0.000000000000000D+00 + $ +(-1.000000000000000D+00)*AMP(1028)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2113)+(-1.000000000000000D+00) $ *TMP_JAMP(2143)+TMP_JAMP(2334)+(-1.000000000000000D+00) $ *TMP_JAMP(2545)+TMP_JAMP(2714)+(-1.000000000000000D+00) $ *TMP_JAMP(2762)+TMP_JAMP(2857)+(-1.000000000000000D+00) $ *TMP_JAMP(3002) JAMP(93,1) = ((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1424)+( - $ -1.000000000000000D+00)*AMP(1893)+TMP_JAMP(2465)+TMP_JAMP(2476) + $ *TMP_JAMP(1769)+(-1.000000000000000D+00)*AMP(1420)+( + $ -1.000000000000000D+00)*AMP(1889)+TMP_JAMP(2465)+TMP_JAMP(2476) $ +(-1.000000000000000D+00)*TMP_JAMP(2625)+(-1.000000000000000D $ +00)*TMP_JAMP(2917)+TMP_JAMP(2928)+(-1.000000000000000D+00) $ *TMP_JAMP(2931)+TMP_JAMP(2950)+TMP_JAMP(3024) @@ -18558,7 +18558,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +00))*TMP_JAMP(237)+(-1.000000000000000D+00)*TMP_JAMP(1043) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1250) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1350) - $ +(-1.000000000000000D+00)*AMP(1027)+TMP_JAMP(2135) + $ +(-1.000000000000000D+00)*AMP(1025)+TMP_JAMP(2135) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2355)+(-1.000000000000000D $ +00)*TMP_JAMP(2381)+TMP_JAMP(2757)+TMP_JAMP(2779)+( @@ -18578,13 +18578,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1399)+(-1.000000000000000D+00)*TMP_JAMP(1953)+( $ -1.000000000000000D+00)*TMP_JAMP(2025)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2121)+(-1.000000000000000D+00) - $ *AMP(1449)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D + $ *AMP(1445)+TMP_JAMP(2234)+TMP_JAMP(2634)+(-1.000000000000000D $ +00)*TMP_JAMP(2671)+TMP_JAMP(2689)+TMP_JAMP(2727)+TMP_JAMP(2866) $ +TMP_JAMP(3012) JAMP(98,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1401)+TMP_JAMP(1952)+(-1.000000000000000D+00) $ *TMP_JAMP(2022)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1443) + $ *TMP_JAMP(2118)+(-1.000000000000000D+00)*AMP(1439) $ +TMP_JAMP(2390)+(-1.000000000000000D+00)*TMP_JAMP(2408) $ +TMP_JAMP(2456)+(-1.000000000000000D+00)*TMP_JAMP(2689) $ 
+TMP_JAMP(2841)+TMP_JAMP(2908)+(-1.000000000000000D+00) @@ -18593,13 +18593,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(1018)+TMP_JAMP(1376)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(1378)+TMP_JAMP(1913) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2121) - $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1448)+( + $ +TMP_JAMP(2124)+(-1.000000000000000D+00)*AMP(1444)+( $ -1.000000000000000D+00)*TMP_JAMP(2490)+(-1.000000000000000D+00) $ *TMP_JAMP(2638)+TMP_JAMP(2765)+(-1.000000000000000D+00) $ *TMP_JAMP(2843)+TMP_JAMP(2901) JAMP(100,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(746)+(-1.000000000000000D+00)*TMP_JAMP(1278)+( - $ -1.000000000000000D+00)*AMP(1038)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1036)+(-1.000000000000000D+00) $ *TMP_JAMP(1913)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2012)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2119)+(-1.000000000000000D+00)*TMP_JAMP(2499) @@ -18608,13 +18608,13 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ -1.000000000000000D+00)*TMP_JAMP(2952)+TMP_JAMP(3020) JAMP(101,1) = TMP_JAMP(1910)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(2118)+TMP_JAMP(2124)+( - $ -1.000000000000000D+00)*AMP(1442)+AMP(1813)+TMP_JAMP(2342)+( + $ -1.000000000000000D+00)*AMP(1438)+AMP(1809)+TMP_JAMP(2342)+( $ -1.000000000000000D+00)*TMP_JAMP(2549)+(-1.000000000000000D+00) $ *TMP_JAMP(2842)+(-1.000000000000000D+00)*TMP_JAMP(2867) $ +TMP_JAMP(2984)+TMP_JAMP(3014) JAMP(102,1) = (-1.000000000000000D+00)*TMP_JAMP(1030) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1404) - $ +(-1.000000000000000D+00)*AMP(1037)+(-1.000000000000000D+00) + $ +(-1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) $ *TMP_JAMP(1809)+(-1.000000000000000D+00)*TMP_JAMP(1910) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2018) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2119) @@ -18624,7 +18624,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(103,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1252)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(2125)+( - $ -1.000000000000000D+00)*AMP(1452)+TMP_JAMP(2430)+( + $ -1.000000000000000D+00)*AMP(1448)+TMP_JAMP(2430)+( $ -1.000000000000000D+00)*TMP_JAMP(2447)+(-1.000000000000000D+00) $ *TMP_JAMP(2478)+(-1.000000000000000D+00)*TMP_JAMP(2633) $ +TMP_JAMP(2664)+(-1.000000000000000D+00)*TMP_JAMP(2848) @@ -18634,7 +18634,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ ,1.000000000000000D+00))*TMP_JAMP(845)+((0.000000000000000D+00 $ ,1.000000000000000D+00))*TMP_JAMP(962)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(1228)+TMP_JAMP(2126)+( - $ -1.000000000000000D+00)*AMP(1446)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1442)+(-1.000000000000000D+00) $ *TMP_JAMP(2440)+(-1.000000000000000D+00)*TMP_JAMP(2457)+( $ -1.000000000000000D+00)*TMP_JAMP(2580)+TMP_JAMP(2739)+( $ -1.000000000000000D+00)*TMP_JAMP(2830)+(-1.000000000000000D+00) @@ -18644,7 +18644,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(989)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(1388)+(-1.000000000000000D+00)*TMP_JAMP(1670) $ +TMP_JAMP(2088)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1450) + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1446) $ 
+TMP_JAMP(2901)+(-1.000000000000000D+00)*TMP_JAMP(2937)+( $ -1.000000000000000D+00)*TMP_JAMP(2944)+(-1.000000000000000D+00) $ *TMP_JAMP(3026) @@ -18666,7 +18666,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1304) $ +(-1.000000000000000D+00)*TMP_JAMP(1914)+TMP_JAMP(2089) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1444)+TMP_JAMP(2576) + $ +(-1.000000000000000D+00)*AMP(1440)+TMP_JAMP(2576) $ +TMP_JAMP(2828)+(-1.000000000000000D+00)*TMP_JAMP(2939)+( $ -1.000000000000000D+00)*TMP_JAMP(3026) JAMP(108,1) = (-1.000000000000000D+00)*AMP(411) @@ -18674,7 +18674,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(301) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(334) $ +(-1.000000000000000D+00)*TMP_JAMP(437)+TMP_JAMP(440) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(596)+( + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(594)+( $ -1.000000000000000D+00)*TMP_JAMP(781)+(-1.000000000000000D+00) $ *TMP_JAMP(817)+TMP_JAMP(846)+((0.000000000000000D+00, $ -1.000000000000000D+00))*TMP_JAMP(977)+((0.000000000000000D+00, @@ -18689,7 +18689,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(1378) $ +(-1.000000000000000D+00)*TMP_JAMP(1884)+(-1.000000000000000D $ +00)*TMP_JAMP(2039)+((0.000000000000000D+00,-1.000000000000000D - $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1453)+( + $ +00))*TMP_JAMP(2068)+(-1.000000000000000D+00)*AMP(1449)+( $ -1.000000000000000D+00)*TMP_JAMP(2357)+TMP_JAMP(2523)+( $ -1.000000000000000D+00)*TMP_JAMP(2573)+TMP_JAMP(2678)+( $ -1.000000000000000D+00)*TMP_JAMP(2766)+TMP_JAMP(2775)+( @@ -18697,7 +18697,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(110,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(990)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1248)+TMP_JAMP(1277)+(-1.000000000000000D+00) - $ *AMP(1040)+((0.000000000000000D+00,1.000000000000000D+00)) + $ *AMP(1038)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(1852)+TMP_JAMP(1884)+TMP_JAMP(2040) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2116) $ +(-1.000000000000000D+00)*TMP_JAMP(2338)+(-1.000000000000000D @@ -18705,7 +18705,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(3015)+(-1.000000000000000D+00)*TMP_JAMP(3020) JAMP(111,1) = TMP_JAMP(1516)+(-1.000000000000000D+00) $ *TMP_JAMP(1932)+((0.000000000000000D+00,1.000000000000000D+00)) - $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1451)+( + $ *TMP_JAMP(2117)+(-1.000000000000000D+00)*AMP(1447)+( $ -1.000000000000000D+00)*TMP_JAMP(2371)+TMP_JAMP(2519) $ +TMP_JAMP(2572)+(-1.000000000000000D+00)*TMP_JAMP(2679) $ +TMP_JAMP(2695)+TMP_JAMP(2787)+((0.000000000000000D+00 @@ -18724,7 +18724,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(78)+((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(321)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(739)+(-1.000000000000000D+00)*TMP_JAMP(1272)+( - $ -1.000000000000000D+00)*AMP(1035)+(-1.000000000000000D+00) + $ -1.000000000000000D+00)*AMP(1033)+(-1.000000000000000D+00) $ *TMP_JAMP(1810)+((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(2091)+TMP_JAMP(2803)+(-1.000000000000000D+00) $ *TMP_JAMP(2933)+TMP_JAMP(2991)+(-1.000000000000000D+00) @@ -18745,15 
+18745,15 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2915)+(-1.000000000000000D+00)*TMP_JAMP(2991) JAMP(115,1) = ((0.000000000000000D+00,-1.000000000000000D+00)) $ *TMP_JAMP(589)+((0.000000000000000D+00,-1.000000000000000D+00)) - $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1447)+( + $ *TMP_JAMP(2122)+(-1.000000000000000D+00)*AMP(1443)+( $ -1.000000000000000D+00)*TMP_JAMP(2373)+TMP_JAMP(2550)+( $ -1.000000000000000D+00)*TMP_JAMP(2574)+(-1.000000000000000D+00) $ *TMP_JAMP(2582)+(-1.000000000000000D+00)*TMP_JAMP(2626) $ +TMP_JAMP(2629)+TMP_JAMP(2941)+(-1.000000000000000D+00) $ *TMP_JAMP(3014) - JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1039) + JAMP(116,1) = TMP_JAMP(1279)+(-1.000000000000000D+00)*AMP(1037) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2122) - $ +TMP_JAMP(2143)+AMP(1669)+(-1.000000000000000D+00) + $ +TMP_JAMP(2143)+AMP(1665)+(-1.000000000000000D+00) $ *TMP_JAMP(2371)+(-1.000000000000000D+00)*TMP_JAMP(2619)+( $ -1.000000000000000D+00)*TMP_JAMP(2823)+TMP_JAMP(2853)+( $ -1.000000000000000D+00)*TMP_JAMP(2989)+(-1.000000000000000D+00) @@ -18761,7 +18761,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) JAMP(117,1) = ((0.000000000000000D+00,1.000000000000000D+00)) $ *TMP_JAMP(589)+(-1.000000000000000D+00)*TMP_JAMP(1658) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(2137) - $ +(-1.000000000000000D+00)*AMP(1445)+AMP(1519)+TMP_JAMP(2596) + $ +(-1.000000000000000D+00)*AMP(1441)+AMP(1515)+TMP_JAMP(2596) $ +TMP_JAMP(2624)+TMP_JAMP(2633)+TMP_JAMP(2884)+TMP_JAMP(2908)+( $ -1.000000000000000D+00)*TMP_JAMP(2928)+TMP_JAMP(2959) JAMP(118,1) = ((0.000000000000000D+00,1.000000000000000D+00)) @@ -18777,7 +18777,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ *TMP_JAMP(2858)+TMP_JAMP(2918)+(-1.000000000000000D+00) $ *TMP_JAMP(2959) JAMP(119,1) = (-1.000000000000000D+00)*TMP_JAMP(1041)+( - $ -1.000000000000000D+00)*AMP(1036)+TMP_JAMP(1608) + $ -1.000000000000000D+00)*AMP(1034)+TMP_JAMP(1608) $ +((0.000000000000000D+00,-1.000000000000000D+00))*TMP_JAMP(2148) $ +(-1.000000000000000D+00)*TMP_JAMP(2614)+TMP_JAMP(2635) $ +TMP_JAMP(2933)+TMP_JAMP(2992)+TMP_JAMP(3019) @@ -18790,7 +18790,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) $ +TMP_JAMP(531)+(-1.000000000000000D+00)*TMP_JAMP(1418)+( $ -1.000000000000000D+00)*TMP_JAMP(1673)+TMP_JAMP(1724) $ +((0.000000000000000D+00,1.000000000000000D+00))*TMP_JAMP(1797) - $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1462) + $ +((0.000000000000000D+00,-1.000000000000000D+00))*AMP(1458) $ +TMP_JAMP(2619)+(-1.000000000000000D+00)*TMP_JAMP(2634) $ +TMP_JAMP(2670)+(-1.000000000000000D+00)*TMP_JAMP(2916)+( $ -1.000000000000000D+00)*TMP_JAMP(2992) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % 
b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index e8d8232be5..73a2d9596c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005519866943359375  +DEBUG: model prefixing takes 0.00565791130065918  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.893 s +1 processes with 1240 diagrams generated in 1.891 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.604 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.621 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.356 s +ALOHA: aloha creates 5 routines in 0.351 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m13.085s -user 0m12.921s -sys 0m0.106s +real 0m13.161s +user 0m12.961s +sys 0m0.105s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2338d395b7..3fcb694ccd 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005399465560913086  +DEBUG: model prefixing takes 0.005532264709472656  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,10 +177,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.222 s +Wrote files for 32 helas calls in 0.224 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -260,7 +260,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -287,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. 
quit -real 0m1.957s -user 0m1.708s -sys 0m0.241s +real 0m1.962s +user 0m1.726s +sys 0m0.237s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index ad74707ae9..06d5354735 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005573272705078125  +DEBUG: model prefixing takes 0.0056154727935791016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
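
(The same banner.py hunk, repeated here for gq_ttq.mad, also extends RunCard.__new__ with an "elif issubclass(finput, RunCard)" branch, so a RunCard subclass can be passed directly as finput and is used as the concrete target class. A simplified, self-contained sketch of that dispatch pattern - the real method also inspects file/banner inputs and can fall back to launch_plugin.RunCard, and RunCardLO below is a hypothetical stand-in:

    class RunCard(object):
        def __new__(cls, finput=None, **opt):
            if cls is RunCard and isinstance(finput, type) and issubclass(finput, RunCard):
                return super(RunCard, cls).__new__(finput)  # the new 'elif' branch
            return super(RunCard, cls).__new__(cls)

    class RunCardLO(RunCard):  # hypothetical concrete run card class
        pass

    assert type(RunCard(RunCardLO)) is RunCardLO

)
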
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=1 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=1 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.146 s FFV1 FFV1 FFV1 @@ -225,9 +225,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.658s -user 0m0.590s -sys 0m0.062s +real 0m0.655s +user 0m0.595s +sys 0m0.055s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 9d96566eb2..645c0db954 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -135,22 +135,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  +DEBUG: type(subproc_group)= [output.py at line 193]  +DEBUG: type(fortran_model)= [output.py at line 194]  +DEBUG: type(me)= me=0 [output.py at line 195]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates 1 routines in 0.062 s @@ -163,9 +163,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  quit -real 0m0.429s +real 0m0.430s user 0m0.371s -sys 0m0.051s +sys 0m0.055s diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index bb2844f553..1d0d9e2a35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00551915168762207  +DEBUG: model prefixing takes 0.005470752716064453  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.844 s +65 processes with 1119 diagrams generated in 1.856 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6226]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1862]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.298 s -Wrote files for 810 helas calls in 3.297 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.304 s +Wrote files for 810 helas calls in 3.574 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.339 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.355 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha 
creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.318 s VVV1 VVV1 FFV1 @@ -844,7 +844,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -1021,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 235]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m8.967s -user 0m8.408s -sys 0m0.506s +real 0m9.272s +user 0m8.475s +sys 0m0.501s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index ef1bf58979..f0d38c2e5a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2704,7 +2704,8 @@ def __new__(cls, finput=None, **opt): except Exception as error: import launch_plugin target_class = launch_plugin.RunCard - + elif issubclass(finput, RunCard): + target_class = finput else: return None @@ -2968,11 +2969,12 @@ def write(self, output_file, template=None, python_template=False, if python_template and not to_write: import string if self.blocks: - text = string.Template(text) mapping = {} for b in self.blocks: mapping[b.name] = b.get_template(self) - text = text.substitute(mapping) + if "$%s" % b.name not in text: + text += "\n$%s\n" % b.name + text = string.Template(text).substitute(mapping) if not self.list_parameter: text = text % self From 1fd1c4c5f493c21c3b271f980571db21c604bc7c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 8 Nov 2023 20:51:09 +0100 Subject: [PATCH 04/14] [actions/gpucpp] TEMPORARILY disable testsuite on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') --- .github/workflows/testsuite_allprocesses.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 7eaad09c9f..662284f944 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -15,8 +15,9 @@ on: workflow_dispatch: # Trigger the all-processes workflow for pull requests to master - pull_request: - branches: [ master ] + # TEMPORARILY disable these tests on PRs (gh extension install 
actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') + ###pull_request: + ### branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed push: From a4f748717dd57b3632caf5947e6fb48e22f2831a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:11:16 +0100 Subject: [PATCH 05/14] [gpucpp] rerun 78 tput tests, with FPEs enabled in the check executable - usual failures in ggttg f/m and gqttq f (#783), no change in performance --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd1.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 36 +++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- 
 .../log_ggttgg_mad_f_inl0_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_common.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt    | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd1.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl1_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_f_inl1_hrd1.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_m_inl0_hrd0.txt           | 100 +++++++++---------
 .../log_ggttgg_mad_m_inl0_hrd1.txt           | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt   | 100 +++++++++---------
 .../log_ggttggg_mad_d_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt   | 100 +++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_m_inl0_hrd0.txt          | 100 +++++++++---------
 .../log_ggttggg_mad_m_inl0_hrd1.txt          | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd0.txt            | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd0_bridge.txt     | 100 +++++++++---------
 .../log_gqttq_mad_d_inl0_hrd1.txt            | 100 +++++++++---------
 .../log_gqttq_mad_f_inl0_hrd0.txt            |  92 ++++++++--------
 .../log_gqttq_mad_f_inl0_hrd0_bridge.txt     |  92 ++++++++--------
 .../log_gqttq_mad_f_inl0_hrd1.txt            |  92 ++++++++--------
 .../log_gqttq_mad_m_inl0_hrd0.txt            | 100 +++++++++---------
 .../log_gqttq_mad_m_inl0_hrd1.txt            | 100 +++++++++---------
 78 files changed, 3476 insertions(+), 3476 deletions(-)
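The 78 tput log diffs that follow change only in timestamps and measured rates. A quick way to spot-check the "no change in performance" claim in the commit message is to extract the "EvtsPerSec[MatrixElems] (3)" values from an old and a new copy of the same log and print the relative differences; the compare_tput.py helper sketched below is hypothetical and not part of the repository:

# compare_tput.py (hypothetical): python3 compare_tput.py <old_log> <new_log>
import re
import sys

# matches e.g. "EvtsPerSec[MatrixElems] (3) = ( 4.942022e+08 ) sec^-1"
PATTERN = re.compile(r'EvtsPerSec\[MatrixElems\] \(3\) = \( ([0-9.e+-]+) \)')

def throughputs(path):
    # one value per CUDA/SIMD section of a tput log
    with open(path) as f:
        return [float(m.group(1)) for m in PATTERN.finditer(f.read())]

old, new = throughputs(sys.argv[1]), throughputs(sys.argv[2])
for o, n in zip(old, new):
    print('%.6e -> %.6e (%+.1f%%)' % (o, n, 100.0 * (n - o) / o))

Differences at the level of a few percent, as in the hunks below, are consistent with ordinary run-to-run variation on the same machine.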
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index 4e0cc4f360..4f18003d70 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2023-11-03_19:00:16
+DATE: 2023-11-08_21:15:12
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.995135e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.942022e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.073010e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.482370e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.785159e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.963951e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.649523 sec
-     2,606,897,569 cycles # 2.955 GHz
-     4,039,165,920 instructions # 1.55 insn per cycle
-       0.938736477 seconds time elapsed
+TOTAL : 0.677103 sec
+     2,617,238,862 cycles # 2.883 GHz
+     4,033,048,225 instructions # 1.54 insn per cycle
+       0.968798898 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.116390e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.309346e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.309346e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115937e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.309320e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.309320e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.039128 sec
-    18,293,625,810 cycles # 3.027 GHz
-    44,037,997,118 instructions # 2.41 insn per cycle
-       6.044375342 seconds time elapsed
+TOTAL : 6.040810 sec
+    18,355,110,031 cycles # 3.037 GHz
+    44,036,146,715 instructions # 2.40 insn per cycle
+       6.046149721 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.650519e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.159299e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.159299e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.614682e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.109953e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.109953e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.212186 sec
-    12,761,177,625 cycles # 3.027 GHz
-    31,004,602,670 instructions # 2.43 insn per cycle
-       4.217391637 seconds time elapsed
+TOTAL : 4.305087 sec
+    12,797,655,048 cycles # 2.970 GHz
+    31,002,550,325 instructions # 2.42 insn per cycle
+       4.310429047 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065360e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.886676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.886676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058335e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.864325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.864325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440327 sec - 10,045,086,881 cycles # 2.916 GHz - 19,380,193,658 instructions # 1.93 insn per cycle - 3.445672409 seconds time elapsed +TOTAL : 3.453382 sec + 10,049,928,632 cycles # 2.906 GHz + 19,377,949,384 instructions # 1.93 insn per cycle + 3.458678566 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.092180e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.955480e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.955480e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.139569e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.018506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.018506e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.409304 sec - 9,718,965,428 cycles # 2.848 GHz - 18,998,332,681 instructions # 1.95 insn per cycle - 3.414677998 seconds time elapsed +TOTAL : 3.335195 sec + 9,699,652,158 cycles # 2.904 GHz + 18,994,942,569 instructions # 1.96 insn per cycle + 3.340655484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.821062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.800324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389989e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.852694 sec - 8,598,148,642 cycles # 2.229 GHz - 15,740,848,417 instructions # 1.83 insn per cycle - 3.858015954 seconds time elapsed +TOTAL : 3.895197 sec + 8,617,547,988 cycles # 2.211 GHz + 15,739,004,417 instructions # 1.83 insn per cycle + 3.900641958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index a2a2220e0b..60971ecd43 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:09 +DATE: 2023-11-08_21:50:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.616160e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.542311e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.542311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.736559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745060e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.257075 sec - 7,500,299,564 cycles # 3.000 GHz - 13,128,281,558 instructions # 1.75 insn per cycle - 2.557069801 seconds time elapsed +TOTAL : 2.222962 sec + 7,400,904,179 cycles # 2.991 GHz + 13,138,789,289 instructions # 1.78 insn per cycle + 2.532867460 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074156e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258405e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.457469 sec - 19,613,725,947 cycles # 3.035 GHz - 44,260,538,354 instructions # 2.26 insn per cycle - 6.464068851 seconds time elapsed +TOTAL : 6.440995 sec + 19,547,511,222 cycles # 3.033 GHz + 44,263,760,517 instructions # 2.26 insn per cycle + 6.447379338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.980628e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.980628e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.568240e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.019266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.019266e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.703362 sec - 14,014,545,412 cycles # 2.976 GHz - 31,843,317,256 instructions # 2.27 insn per cycle - 4.710044451 seconds time elapsed +TOTAL : 4.623039 sec + 14,052,579,459 cycles # 3.037 GHz + 31,844,500,266 instructions # 2.27 insn per cycle + 4.629479950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930954e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630364e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.630364e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.863308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.529884e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.529884e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870178 sec - 11,351,058,249 cycles # 2.929 GHz - 20,737,271,008 instructions # 1.83 insn per cycle - 3.876822605 seconds time elapsed +TOTAL : 4.004138 sec + 11,314,763,691 cycles # 2.822 GHz + 20,739,815,252 instructions # 1.83 insn per cycle + 4.010963262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.651989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.961498e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695721e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695721e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.871998 sec - 11,000,759,855 cycles # 2.837 GHz - 20,365,657,381 instructions # 1.85 insn per cycle - 3.879015734 seconds time elapsed +TOTAL : 3.824549 sec + 10,997,567,801 cycles # 2.871 GHz + 20,355,988,697 instructions # 1.85 insn per cycle + 3.831152322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.694377e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.664769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.161936e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161936e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.335020 sec - 9,935,731,633 cycles # 2.289 GHz - 16,882,918,411 instructions # 1.70 insn per cycle - 4.341683669 seconds time elapsed +TOTAL : 4.405341 sec + 9,931,414,577 cycles # 2.252 GHz + 16,884,401,146 instructions # 1.70 insn per cycle + 4.411803387 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index dedce3e2ef..75e14339dc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:12 +DATE: 2023-11-08_22:03:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.493472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.526211e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.980085e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.826607e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.612761e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.962341e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.335531 sec - 4,653,241,552 cycles # 2.971 GHz - 7,232,975,239 instructions # 1.55 insn per cycle - 1.623039981 seconds time elapsed +TOTAL : 1.301469 sec + 4,673,993,383 cycles # 3.055 GHz + 7,270,667,887 instructions # 1.56 insn per cycle + 1.586588942 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100587e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.292616e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.292616e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143440e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343019e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.487751 sec - 19,390,492,430 cycles # 2.987 GHz - 44,137,957,280 instructions # 2.28 insn per cycle - 6.493082825 seconds time elapsed +TOTAL : 6.249393 sec + 19,374,513,863 cycles # 3.098 GHz + 44,137,807,645 instructions # 2.28 insn per cycle + 6.254447436 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.649039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.157189e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.157189e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.651049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.163460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.573606 sec - 13,864,290,699 cycles # 3.029 GHz - 31,004,021,041 instructions # 2.24 insn per cycle - 4.579072706 seconds time elapsed +TOTAL : 4.566003 sec + 13,842,407,454 cycles # 3.029 GHz + 31,004,270,304 instructions # 2.24 insn per cycle + 4.571383086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.865714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.865714e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.085679e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.913536e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.913536e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.825144 sec - 11,151,950,602 cycles # 2.912 GHz - 19,279,192,444 instructions # 1.73 insn per cycle - 3.830421553 seconds time elapsed +TOTAL : 3.759234 sec + 11,164,737,043 cycles # 2.967 GHz + 19,280,466,147 instructions # 1.73 insn per cycle + 3.764531843 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.125943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.996151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.996151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.157188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.041275e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.041275e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.721741 sec - 10,820,749,101 cycles # 2.904 GHz - 18,706,645,976 instructions # 1.73 insn per cycle - 3.727088912 seconds time elapsed +TOTAL : 3.667981 sec + 10,833,619,022 cycles # 2.950 GHz + 18,695,779,485 instructions # 1.73 insn per cycle + 3.673091045 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.802766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.399092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.399092e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.852503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.471081e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.471081e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.260983 sec - 9,758,383,682 cycles # 2.288 GHz - 15,439,422,037 instructions # 1.58 insn per cycle - 4.266311634 seconds time elapsed +TOTAL : 4.150740 sec + 9,740,231,931 cycles # 2.344 GHz + 15,438,395,407 instructions # 1.59 insn per cycle + 4.156220859 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 753c8feb62..c2852b0755 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:43:56 +DATE: 2023-11-08_22:00:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492551e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.537742e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.994776e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.830407e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.634363e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.010779e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.978991 sec - 3,581,699,122 cycles # 2.964 GHz - 7,061,755,742 instructions # 1.97 insn per cycle - 1.265379690 seconds time elapsed +TOTAL : 0.985656 sec + 3,531,228,063 cycles # 2.913 GHz + 6,990,251,865 instructions # 1.98 insn per cycle + 1.270939740 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.108457e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.301315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.301315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.342569e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.342569e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.081290 sec - 18,339,334,415 cycles # 3.014 GHz - 44,033,842,254 instructions # 2.40 insn per cycle - 6.086519540 seconds time elapsed +TOTAL : 5.897072 sec + 18,280,833,177 cycles # 3.098 GHz + 44,034,372,908 instructions # 2.41 insn per cycle + 5.902241793 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.647910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.158230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.158230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.647739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.157991e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.157991e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.219825 sec - 12,790,482,904 cycles # 3.028 GHz - 31,000,190,511 instructions # 2.42 insn per cycle - 4.225042583 seconds time elapsed +TOTAL : 4.221863 sec + 12,803,042,604 cycles # 3.036 GHz + 31,005,296,735 instructions # 2.42 insn per cycle + 4.227230772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.846964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.083518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.912332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.912332e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.470466 sec - 10,075,062,185 cycles # 2.899 GHz - 19,376,808,574 instructions # 1.92 insn per cycle - 3.475725491 seconds time elapsed +TOTAL : 3.412904 sec + 10,065,358,042 cycles # 2.945 GHz + 19,377,556,628 instructions # 1.93 insn per cycle + 3.418078261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.091991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.948349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948349e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.178157e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.068476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.068476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411832 sec - 9,706,821,336 cycles # 2.841 GHz - 18,993,945,887 instructions # 1.96 insn per cycle - 3.417093831 seconds time elapsed +TOTAL : 3.275616 sec + 9,709,500,834 cycles # 2.960 GHz + 18,994,586,612 instructions # 1.96 insn per cycle + 3.280821668 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.817313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417390e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417390e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497430e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497430e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.864825 sec - 8,629,354,000 cycles # 2.231 GHz - 15,737,585,107 instructions # 1.82 insn per cycle - 3.870285071 seconds time elapsed +TOTAL : 3.750555 sec + 8,607,389,256 cycles # 2.292 GHz + 15,737,632,725 instructions # 1.83 insn per cycle + 3.755880546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 8472c31bea..6a5b6e889f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:40:37 +DATE: 2023-11-08_21:57:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.065913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.488032e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.905997e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.203248e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.569989e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906875e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.876732 sec - 6,299,612,348 cycles # 2.989 GHz - 11,571,253,190 instructions # 1.84 insn per cycle - 2.164294467 seconds time elapsed +TOTAL : 1.845079 sec + 6,274,121,781 cycles # 3.027 GHz + 11,554,949,617 instructions # 1.84 insn per cycle + 2.129841068 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.111600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304742e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.304742e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.133729e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.330465e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.330465e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.064598 sec - 18,297,128,822 cycles # 3.015 GHz - 44,033,779,580 instructions # 2.41 insn per cycle - 6.069938342 seconds time elapsed +TOTAL : 5.944694 sec + 18,288,311,212 cycles # 3.074 GHz + 44,034,741,687 instructions # 2.41 insn per cycle + 5.950018785 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.622403e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120612e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.659128e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.174088e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.285433 sec - 12,790,120,071 cycles # 2.982 GHz - 31,000,688,554 instructions # 2.42 insn per cycle - 4.290779048 seconds time elapsed +TOTAL : 4.192977 sec + 12,790,691,952 cycles # 3.048 GHz + 31,002,731,251 instructions # 2.42 insn per cycle + 4.198334883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.044295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854365e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854365e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.084534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.927805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.927805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.476131 sec - 10,066,944,453 cycles # 2.893 GHz - 19,377,002,166 instructions # 1.92 insn per cycle - 3.481530813 seconds time elapsed +TOTAL : 3.410971 sec + 10,102,470,059 cycles # 2.959 GHz + 19,378,571,736 instructions # 1.92 insn per cycle + 3.416356813 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.095206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953285e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953285e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.180416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.077058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.077058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.401536 sec - 9,758,102,764 cycles # 2.865 GHz - 18,996,151,120 instructions # 1.95 insn per cycle - 3.406936941 seconds time elapsed +TOTAL : 3.272531 sec + 9,723,824,348 cycles # 2.967 GHz + 19,005,371,454 instructions # 1.95 insn per cycle + 3.277801420 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410019e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410019e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.875765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.503453e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503453e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.870433 sec - 8,615,604,376 cycles # 2.224 GHz - 15,736,922,136 instructions # 1.83 insn per cycle - 3.875834680 seconds time elapsed +TOTAL : 3.745624 sec + 8,623,946,797 cycles # 2.300 GHz + 15,739,753,667 instructions # 1.83 insn per cycle + 3.750856873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index b542059ad1..3b69c80285 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:00:50 +DATE: 2023-11-08_21:15:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.000398e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960570e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110004e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.519106e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.841619e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.067099e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.647549 sec - 2,611,748,045 cycles # 2.979 GHz - 4,046,502,501 instructions # 1.55 insn per cycle - 0.933750268 seconds time elapsed +TOTAL : 0.661524 sec + 2,624,385,702 cycles # 2.945 GHz + 4,009,504,923 instructions # 1.53 insn per cycle + 0.953550123 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.159227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.372064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.372064e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.178868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.397031e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397031e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.833108 sec - 17,445,226,847 cycles # 2.989 GHz - 41,885,202,351 instructions # 2.40 insn per cycle - 5.838346819 seconds time elapsed +TOTAL : 5.737674 sec + 17,431,892,883 cycles # 3.036 GHz + 41,881,565,184 instructions # 2.40 insn per cycle + 5.743076445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.682893e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222491e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.685142e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222963e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.142121 sec - 12,470,632,862 cycles # 3.008 GHz - 30,166,171,065 instructions # 2.42 insn per cycle - 4.147564686 seconds time elapsed +TOTAL : 4.136316 sec + 12,482,235,541 cycles # 3.016 GHz + 30,165,183,766 instructions # 2.42 insn per cycle + 4.141750487 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.069225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.895121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.895121e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065221e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.894043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.894043e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.437470 sec - 9,952,077,094 cycles # 2.891 GHz - 19,112,450,451 instructions # 1.92 insn per cycle - 3.442739539 seconds time elapsed +TOTAL : 3.443708 sec + 9,960,024,892 cycles # 2.889 GHz + 19,109,707,129 instructions # 1.92 insn per cycle + 3.449179794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1930) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.130212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.139235e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.013091e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013091e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.352335 sec - 9,644,260,853 cycles # 2.874 GHz - 18,779,667,176 instructions # 1.95 insn per cycle - 3.357742942 seconds time elapsed +TOTAL : 3.337798 sec + 9,694,110,840 cycles # 2.900 GHz + 18,764,903,742 instructions # 1.94 insn per cycle + 3.343110507 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865497e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.495990e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.495990e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.864706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.496201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.496201e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.772482 sec - 8,452,356,069 cycles # 2.238 GHz - 15,617,271,494 instructions # 1.85 insn per cycle - 3.777813091 seconds time elapsed +TOTAL : 3.773287 sec + 8,448,094,450 cycles # 2.236 GHz + 15,614,366,385 instructions # 1.85 insn per cycle + 3.778658466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9fba89aff3..abd8e16103 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:25 +DATE: 2023-11-08_21:39:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.483432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.567049e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.058193e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541150e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.656561e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.025623e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676370 sec - 2,703,741,341 cycles # 2.971 GHz - 4,197,515,180 instructions # 1.55 insn per cycle - 0.967825669 seconds time elapsed +TOTAL : 0.677402 sec + 2,672,042,758 cycles # 2.933 GHz + 4,104,960,698 instructions # 1.54 insn per cycle + 0.969965661 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.672486e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.141310e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.141310e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.643045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106548e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.106548e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.163173 sec - 12,692,329,334 cycles # 3.045 GHz - 32,576,040,648 instructions # 2.57 insn per cycle - 4.168672183 seconds time elapsed +TOTAL : 4.237683 sec + 12,698,973,738 cycles # 2.997 GHz + 32,580,365,424 instructions # 2.57 insn per cycle + 4.243310096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.116856e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.025219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.025219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.102523e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.004727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.004727e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.372207 sec - 10,267,724,267 cycles # 3.041 GHz - 24,505,197,015 instructions # 2.39 insn per cycle - 3.377809241 seconds time elapsed +TOTAL : 3.394812 sec + 10,279,599,861 cycles # 3.024 GHz + 24,505,440,482 instructions # 2.38 insn per cycle + 3.400499086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.304978e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.380785e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.301834e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.372180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372180e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.125688 sec - 9,128,103,141 cycles # 2.916 GHz - 16,940,836,203 instructions # 1.86 insn per cycle - 3.131242434 seconds time elapsed +TOTAL : 3.131325 sec + 9,114,816,336 cycles # 2.906 GHz + 16,941,253,973 instructions # 1.86 insn per cycle + 3.136898880 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.298021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382509e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382509e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334227e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.444641e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.444641e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.144282 sec - 8,899,696,508 cycles # 2.834 GHz - 16,372,313,838 instructions # 1.84 insn per cycle - 3.149838418 seconds time elapsed +TOTAL : 3.093526 sec + 8,877,539,414 cycles # 2.866 GHz + 16,358,190,505 instructions # 1.84 insn per cycle + 3.099088246 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.845549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.845549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978126e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.726122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.726122e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.465226 sec - 7,910,184,141 cycles # 2.280 GHz - 14,591,740,895 instructions # 1.84 insn per cycle - 3.470686114 seconds time elapsed +TOTAL : 3.588578 sec + 7,927,907,472 cycles # 2.207 GHz + 14,594,253,089 instructions # 1.84 insn per cycle + 3.594362581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 9b85799057..d14dcc2cec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:23:55 +DATE: 2023-11-08_21:40:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.569964e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.063993e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.548142e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.673863e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.063444e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677772 sec - 2,691,282,086 cycles # 2.960 GHz - 4,219,338,579 instructions # 1.57 insn per cycle - 0.971577356 seconds time elapsed +TOTAL : 0.673757 sec + 2,682,929,459 cycles # 2.958 GHz + 4,116,085,529 instructions # 1.53 insn per cycle + 0.967020710 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087943e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087943e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187961e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.086286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086286e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.286151 sec - 9,910,806,255 cycles # 3.012 GHz - 25,456,031,111 instructions # 2.57 insn per cycle - 3.291763573 seconds time elapsed +TOTAL : 3.278046 sec + 9,891,835,516 cycles # 3.013 GHz + 25,457,241,379 instructions # 2.57 insn per cycle + 3.283538395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.467752e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800434e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800434e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.461475e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800212e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.950518 sec - 8,946,482,743 cycles # 3.027 GHz - 21,514,123,834 instructions # 2.40 insn per cycle - 2.956056552 seconds time elapsed +TOTAL : 2.961448 sec + 8,958,054,464 cycles # 3.020 GHz + 21,514,605,384 instructions # 2.40 insn per cycle + 2.967091806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.464134e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.723435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.723435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.449114e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.718886e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.718886e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.952533 sec - 8,633,003,733 cycles # 2.920 GHz - 15,829,431,121 instructions # 1.83 insn per cycle - 2.958100358 seconds time elapsed +TOTAL : 2.969121 sec + 8,647,101,919 cycles # 2.908 GHz + 15,830,093,651 instructions # 1.83 insn per cycle + 2.974697377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) 
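[Editorial note on the build tags 'none', 'sse4', 'avx2', '512y', '512z': each corresponds to a different width of the internal SIMD type reported as "Internal loops fptype_sv = VECTOR[n]". A sketch of how such a width-dependent type can be declared with GCC/Clang vector extensions; the fptype_sv name is taken from the logs, but the mapping below is illustrative and not the project's actual header.]

    // Sketch (GCC/Clang vector extensions): a double-precision SIMD type whose
    // width matches the "Internal loops fptype_sv = VECTOR[n]" lines above.
    #if defined(__AVX512F__)
    typedef double fptype_sv __attribute__( ( vector_size( 64 ) ) ); // VECTOR[8], 512-bit zmm ('512z')
    #elif defined(__AVX2__)
    typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // VECTOR[4], 256-bit ymm ('avx2')
    #elif defined(__SSE4_2__)
    typedef double fptype_sv __attribute__( ( vector_size( 16 ) ) ); // VECTOR[2], 128-bit xmm ('sse4')
    #else
    typedef double fptype_sv; // scalar fallback ('none': ~vector[1], no SIMD)
    #endif

[Note that this simple ISA test does not capture the '512y' variant, which the logs describe as AVX-512 instructions on 256-bit registers and which therefore also reports VECTOR[4] in double precision; the single-precision (FLOAT) logs show the same widths doubled, VECTOR[8] and VECTOR[16].]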
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.533505e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.859681e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.859681e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825562e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825562e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.876122 sec - 8,428,640,196 cycles # 2.926 GHz - 15,527,735,744 instructions # 1.84 insn per cycle - 2.881608685 seconds time elapsed +TOTAL : 2.898480 sec + 8,435,230,503 cycles # 2.906 GHz + 15,528,950,884 instructions # 1.84 insn per cycle + 2.904204103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.128966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008830e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.166244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.072345e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.072345e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.361119 sec - 7,560,312,259 cycles # 2.246 GHz - 14,293,668,051 instructions # 1.89 insn per cycle - 3.366622669 seconds time elapsed +TOTAL : 3.304157 sec + 7,572,571,500 cycles # 2.289 GHz + 14,293,792,931 instructions # 1.89 insn per cycle + 3.309751939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 46e803358f..cfc01e370f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:23 +DATE: 2023-11-08_21:16:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.626199e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.328475e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.281681e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.506984e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.290770e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.275463e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.560646 sec - 2,313,886,918 cycles # 2.957 GHz - 3,567,705,327 instructions # 1.54 insn per cycle - 0.840116151 seconds time elapsed +TOTAL : 0.565965 sec + 2,321,819,505 cycles # 2.946 GHz + 3,610,558,250 instructions # 1.56 insn per cycle + 0.846354753 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358105e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.127208e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.335415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335415e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.851033 sec - 17,813,996,987 cycles # 3.043 GHz - 43,616,814,202 instructions # 2.45 insn per cycle - 5.856069183 seconds time elapsed +TOTAL : 5.947738 sec + 17,831,603,454 cycles # 2.997 GHz + 43,615,812,813 instructions # 2.45 insn per cycle + 5.952849241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.343466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599751e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.048613 sec - 9,276,606,540 cycles # 3.040 GHz - 21,930,294,042 instructions # 2.36 insn per cycle - 3.053688884 seconds time elapsed +TOTAL : 3.049781 sec + 9,255,993,248 cycles # 3.030 GHz + 21,926,767,970 instructions # 2.37 insn per cycle + 3.055067484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.872956e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872956e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.528612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.886098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.886098e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.845518 sec - 8,308,772,789 cycles # 2.916 GHz - 15,593,301,532 instructions # 1.88 insn per cycle - 2.850623438 seconds time elapsed +TOTAL : 2.841538 sec + 8,310,122,274 cycles # 2.920 GHz + 15,590,852,784 instructions # 1.88 insn per cycle + 2.846613446 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.489948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.840461e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.840461e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.544975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.933439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.933439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.887357 sec - 8,231,785,355 cycles # 2.847 GHz - 15,437,944,905 instructions # 1.88 insn per cycle - 2.892363682 seconds time elapsed +TOTAL : 2.829740 sec + 8,228,769,997 cycles # 2.904 GHz + 15,439,791,314 instructions # 1.88 insn per cycle + 2.834839900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.580760e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.973673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973673e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.468064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.774733e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.774733e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.796324 sec - 6,629,287,981 cycles # 2.367 GHz - 12,873,018,117 instructions # 1.94 insn per cycle - 2.801456274 seconds time elapsed +TOTAL : 2.920266 sec + 6,654,443,055 cycles # 2.276 GHz + 12,870,591,658 instructions # 1.93 insn per cycle + 2.925460933 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
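[Editorial note on the EvtsPerSec figures: they are wall-clock throughputs, i.e. a fixed number of generated events divided by the elapsed time of the corresponding phase ("Rmb+ME" covers phase-space sampling plus matrix elements, "MatrixElems" the matrix elements alone). A minimal sketch of that measurement; computeMatrixElements is a placeholder for the timed kernel, not the project's actual API.]

    #include <chrono>

    // Placeholder for the kernel being timed (sketch only, not the real API).
    void computeMatrixElements( int nevt );

    double evtsPerSec( int nevt )
    {
      const auto t0 = std::chrono::steady_clock::now();
      computeMatrixElements( nevt );
      const auto t1 = std::chrono::steady_clock::now();
      // events divided by elapsed wall-clock seconds, as in the logs above
      return nevt / std::chrono::duration<double>( t1 - t0 ).count();
    }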
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index a12ca3b41d..b89c0950e0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:34:47 +DATE: 2023-11-08_21:51:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.243102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.475352e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.475352e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262139e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.843159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.843159e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.676327 sec - 5,681,132,328 cycles # 2.981 GHz - 10,328,752,116 instructions # 1.82 insn per cycle - 1.962251346 seconds time elapsed +TOTAL : 1.672229 sec + 5,680,712,756 cycles # 2.985 GHz + 10,249,439,391 instructions # 1.80 insn per cycle + 1.960159582 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320071e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320071e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.122888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326457e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.103747 sec - 18,503,457,384 cycles # 3.029 GHz - 43,763,268,873 instructions # 2.37 insn per cycle - 6.109986471 seconds time elapsed +TOTAL : 6.070293 sec + 18,467,877,178 cycles # 3.040 GHz + 43,763,046,084 instructions # 2.37 insn per cycle + 6.076144883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169781e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246790e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246790e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.241087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.353707e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.353707e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.406148 sec - 10,026,239,155 cycles # 2.945 GHz - 23,264,915,776 instructions # 2.32 insn per cycle - 3.412744895 seconds time elapsed +TOTAL : 3.295191 sec + 10,020,961,358 cycles # 3.037 GHz + 23,261,304,628 instructions # 2.32 insn per cycle + 3.301360149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.376931e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582524e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582524e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.364429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552712e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552712e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.133404 sec - 9,115,108,969 cycles # 2.904 GHz - 16,712,850,458 instructions # 1.83 insn per cycle - 3.139765331 seconds time elapsed +TOTAL : 3.146782 sec + 9,058,696,000 cycles # 2.874 GHz + 16,711,646,468 instructions # 1.84 insn per cycle + 3.152847146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.412136e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.649634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.649634e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.299176e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.448559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.448559e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.093398 sec - 9,015,171,302 cycles # 2.909 GHz - 16,559,247,945 instructions # 1.84 insn per cycle - 3.099791137 seconds time elapsed +TOTAL : 3.242101 sec + 8,995,544,368 cycles # 2.776 GHz + 16,559,826,795 instructions # 1.84 insn per cycle + 3.248399630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.406219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.425438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624655e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624655e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.106272 sec - 7,475,444,541 cycles # 2.404 GHz - 14,076,958,110 instructions # 1.88 insn per cycle - 3.112522018 seconds time elapsed +TOTAL : 3.082964 sec + 7,440,102,740 cycles # 2.410 GHz + 14,077,595,444 instructions # 1.89 insn per cycle + 3.089018136 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index e12a7cff38..a9a0d75eb2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:47:49 +DATE: 2023-11-08_22:04:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309547e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.164321e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.211559e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383746e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209904e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.237350e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.178788 sec - 4,175,363,575 cycles # 2.986 GHz - 6,687,157,832 instructions # 1.60 insn per cycle - 1.455561692 seconds time elapsed +TOTAL : 1.160269 sec + 4,203,267,927 cycles # 3.027 GHz + 6,686,907,403 instructions # 1.59 insn per cycle + 1.447760091 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.139229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.352216e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.352216e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.159461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.377089e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.377089e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.211284 sec - 18,855,190,279 cycles # 3.034 GHz - 43,795,517,542 instructions # 2.32 insn per cycle - 6.216374296 seconds time elapsed +TOTAL : 6.100201 sec + 18,832,208,439 cycles # 3.085 GHz + 43,796,080,670 instructions # 2.33 insn per cycle + 6.105246671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.318674e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546898e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546898e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.360687e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.606052e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.606052e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.402195 sec - 10,237,833,782 cycles # 3.006 GHz - 22,007,212,368 instructions # 2.15 insn per cycle - 3.407333694 seconds time elapsed +TOTAL : 3.340864 sec + 10,252,717,994 cycles # 3.065 GHz + 22,009,397,675 instructions # 2.15 insn per cycle + 3.349625818 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.476676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.816143e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.816143e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.544336e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.928692e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.928692e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.234448 sec - 9,334,268,427 cycles # 2.883 GHz - 15,503,242,414 instructions # 1.66 insn per cycle - 3.239539945 seconds time elapsed +TOTAL : 3.145870 sec + 9,340,548,482 cycles # 2.966 GHz + 15,504,284,674 instructions # 1.66 insn per cycle + 3.151101472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.532354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.931778e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.931778e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.556429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.968460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.968460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.179353 sec - 9,298,076,707 cycles # 2.921 GHz - 15,144,691,612 instructions # 1.63 insn per cycle - 3.184641880 seconds time elapsed +TOTAL : 3.140902 sec + 9,274,295,743 cycles # 2.952 GHz + 15,151,601,553 instructions # 1.63 insn per cycle + 3.145942426 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.550309e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.928739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.928739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.615564e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.042980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.042980e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.163394 sec - 7,678,426,346 cycles # 2.424 GHz - 12,579,409,911 instructions # 1.64 insn per cycle - 3.168501704 seconds time elapsed +TOTAL : 3.083778 sec + 7,670,760,165 cycles # 2.484 GHz + 12,580,664,280 instructions # 1.64 insn per cycle + 3.088953388 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index ed97b2f8ed..e8e5add4c9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:44:30 +DATE: 2023-11-08_22:00:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.311918e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184761e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.263047e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217605e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.255851e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.849658 sec - 3,163,783,620 cycles # 2.955 GHz - 6,425,624,965 instructions # 2.03 insn per cycle - 1.127772989 seconds time elapsed +TOTAL : 0.834564 sec + 3,199,482,421 cycles # 3.039 GHz + 6,490,454,019 instructions # 2.03 insn per cycle + 1.111753408 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344208e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.344208e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.366095e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366095e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.993383 sec - 18,094,070,839 cycles # 3.017 GHz - 43,613,404,695 instructions # 2.41 insn per cycle - 5.998406050 seconds time elapsed +TOTAL : 5.833467 sec + 17,826,844,076 cycles # 3.054 GHz + 43,615,420,578 instructions # 2.45 insn per cycle + 5.838895279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.281067e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.486158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.486158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.337314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.571728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.571728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.130477 sec - 9,257,197,715 cycles # 2.953 GHz - 21,925,291,921 instructions # 2.37 insn per cycle - 3.135663717 seconds time elapsed +TOTAL : 3.054886 sec + 9,243,837,324 cycles # 3.022 GHz + 21,925,827,754 instructions # 2.37 insn per cycle + 3.060063052 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.526300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.881905e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.881905e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.568595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.965452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.965452e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.846007 sec - 8,323,404,187 cycles # 2.920 GHz - 15,589,367,643 instructions # 1.87 insn per cycle - 2.851124263 seconds time elapsed +TOTAL : 2.797209 sec + 8,337,217,151 cycles # 2.976 GHz + 15,590,584,627 instructions # 1.87 insn per cycle + 2.802297250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) 
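[Editorial note on the cycles / instructions / "insn per cycle" triplets: these are hardware performance counters in perf-stat style output, and the third number is simply the ratio of the first two. A tiny check of that arithmetic, with the counter values copied from the avx2 stanza just above.]

    #include <cstdio>

    int main()
    {
      // Counter values copied from the avx2 stanza above (log values, not recomputed)
      const double instructions = 15590584627.;
      const double cycles = 8337217151.;
      std::printf( "insn per cycle = %.2f\n", instructions / cycles ); // prints 1.87, matching the log
      return 0;
    }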
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951403e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.951403e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.613887e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.042160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.042160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.815665 sec - 8,248,875,592 cycles # 2.925 GHz - 15,439,478,624 instructions # 1.87 insn per cycle - 2.820889860 seconds time elapsed +TOTAL : 2.753850 sec + 8,236,246,865 cycles # 2.988 GHz + 15,440,580,051 instructions # 1.87 insn per cycle + 2.758988038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.553964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.948928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.649804e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085948e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085948e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.827281 sec - 6,687,814,053 cycles # 2.363 GHz - 12,869,763,437 instructions # 1.92 insn per cycle - 2.832592565 seconds time elapsed +TOTAL : 2.730536 sec + 6,628,841,045 cycles # 2.424 GHz + 12,869,136,387 instructions # 1.94 insn per cycle + 2.735524791 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index c7d745ef4d..4353a0323c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:41:12 +DATE: 2023-11-08_21:57:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.077097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138341e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120075e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.439872e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.182276e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.152259e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.480161 sec - 5,077,584,264 cycles # 2.967 GHz - 9,258,149,444 instructions # 1.82 insn per cycle - 1.768271684 seconds time elapsed +TOTAL : 1.433462 sec + 5,039,023,930 cycles # 3.052 GHz + 9,234,566,396 instructions # 1.83 insn per cycle + 1.710073871 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.142005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.165155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.381805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.381805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.874856 sec - 17,835,700,462 cycles # 3.034 GHz - 43,613,540,806 instructions # 2.45 insn per cycle - 5.879931479 seconds time elapsed +TOTAL : 5.755055 sec + 17,830,794,091 cycles # 3.096 GHz + 43,613,836,777 instructions # 2.45 insn per cycle + 5.760227416 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.282759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.340707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569922e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569922e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.129132 sec - 9,269,728,355 cycles # 2.963 GHz - 21,928,484,188 instructions # 2.37 insn per cycle - 3.134244707 seconds time elapsed +TOTAL : 3.052308 sec + 9,235,069,524 cycles # 3.022 GHz + 21,925,950,370 instructions # 2.37 insn per cycle + 3.057391403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516560e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.868004e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.868004e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.942662e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942662e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.857533 sec - 8,336,241,805 cycles # 2.913 GHz - 15,589,958,795 instructions # 1.87 insn per cycle - 2.862709487 seconds time elapsed +TOTAL : 2.806664 sec + 8,327,245,678 cycles # 2.963 GHz + 15,591,035,358 instructions # 1.87 insn per cycle + 2.811768123 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.924197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.924197e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.574877e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971987e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.838427 sec - 8,267,692,084 cycles # 2.908 GHz - 15,438,877,256 instructions # 1.87 insn per cycle - 2.843475918 seconds time elapsed +TOTAL : 2.795905 sec + 8,237,659,186 cycles # 2.942 GHz + 15,439,551,856 instructions # 1.87 insn per cycle + 2.800978610 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.539393e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.905150e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.905150e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.627739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061419e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.843291 sec - 6,667,785,493 cycles # 2.342 GHz - 12,868,798,226 instructions # 1.93 insn per cycle - 2.848396098 seconds time elapsed +TOTAL : 2.749607 sec + 6,653,390,801 cycles # 2.416 GHz + 12,870,556,050 instructions # 1.93 insn per cycle + 2.754896991 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
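[Editorial note on the WARNING lines "Bridge selected: cannot use RamboDevice, will use RamboHost" and "RamboHost selected: cannot use CurandDevice, will use CurandHost" above: they describe a cascade of fallbacks in which forcing one stage of the pipeline onto the host also demotes the upstream stages that would otherwise feed it from the device. A hedged sketch of that selection logic; the enum and function names are illustrative only and do not reflect the project's actual classes.]

    #include <cstdio>

    enum class Side { Device, Host };

    // Illustrative only: choose where sampling (Rambo) and random numbers (Curand)
    // run, demoting upstream stages when a downstream stage is host-side.
    void chooseWorkflow( bool useBridge, Side& rambo, Side& curand )
    {
      rambo = Side::Device;
      curand = Side::Device;
      if ( useBridge ) // the Bridge consumes host-side momenta
      {
        rambo = Side::Host;
        std::puts( "WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost" );
      }
      if ( rambo == Side::Host ) // host-side Rambo needs host-side random numbers
      {
        curand = Side::Host;
        std::puts( "WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost" );
      }
    }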
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 2a5177092e..4a8bf7a45a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:01:53 +DATE: 2023-11-08_21:16:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.628396e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.344836e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.322116e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.504004e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.299164e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301394e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.558495 sec - 2,344,289,295 cycles # 2.966 GHz - 3,579,154,611 instructions # 1.53 insn per cycle - 0.847997464 seconds time elapsed +TOTAL : 0.565713 sec + 2,319,019,949 cycles # 2.949 GHz + 3,628,185,594 instructions # 1.56 insn per cycle + 0.846311751 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.195436e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.435503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.435503e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.206183e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.634613 sec - 16,757,667,455 cycles # 2.972 GHz - 41,375,848,460 instructions # 2.47 insn per cycle - 5.639688103 seconds time elapsed +TOTAL : 5.582273 sec + 16,756,629,307 cycles # 2.999 GHz + 41,373,009,702 instructions # 2.47 insn per cycle + 5.587382956 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.409189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.401015e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.974456 sec - 9,031,167,153 cycles # 3.032 GHz - 21,234,204,961 instructions # 2.35 insn per cycle - 2.979655809 seconds time elapsed +TOTAL : 2.986422 sec + 9,012,092,925 cycles # 3.013 GHz + 21,229,937,185 instructions # 2.36 insn per cycle + 2.991621252 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1841) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541260e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.541320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.913153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.913153e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.832126 sec - 8,284,857,543 cycles # 2.922 GHz - 15,430,300,133 instructions # 1.86 insn per cycle - 2.837298063 seconds time elapsed +TOTAL : 2.831515 sec + 8,274,365,196 cycles # 2.917 GHz + 15,424,948,763 instructions # 1.86 insn per cycle + 2.836960602 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2536) (512y: 0) (512z: 0) 
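[Editorial note on the "=Symbols in CPPProcess.o=" summaries: they count SIMD instructions of each family in the compiled object, which is how one can verify that e.g. an 'avx2' build really contains ymm code and no zmm code. A sketch of one way to produce such counts by piping objdump output (assumes GNU binutils and POSIX popen; the project's actual classification script may use finer criteria, in particular this register-based count cannot separate '512y', AVX-512 on 256-bit registers, from plain AVX2).]

    #include <cstdio> // also provides POSIX popen/pclose on Linux
    #include <string>

    // Count disassembled instructions whose operands use ymm (256-bit)
    // or zmm (512-bit) registers in a given object file.
    void countSimdSymbols( const std::string& objfile )
    {
      const std::string cmd = "objdump -d " + objfile;
      FILE* pipe = popen( cmd.c_str(), "r" );
      if ( !pipe ) return;
      int nymm = 0, nzmm = 0;
      char line[512];
      while ( fgets( line, sizeof( line ), pipe ) )
      {
        const std::string s( line );
        if ( s.find( "%zmm" ) != std::string::npos ) nzmm++;
        else if ( s.find( "%ymm" ) != std::string::npos ) nymm++;
      }
      pclose( pipe );
      std::printf( "(avx2: %d) (512z: %d)\n", nymm, nzmm );
    }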
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.031163e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.031163e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.051139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.051139e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.778473 sec - 8,124,076,124 cycles # 2.921 GHz - 15,242,043,085 instructions # 1.88 insn per cycle - 2.783650122 seconds time elapsed +TOTAL : 2.773779 sec + 8,126,258,677 cycles # 2.925 GHz + 15,238,451,861 instructions # 1.88 insn per cycle + 2.778950300 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.583024e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982786e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.982786e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.571238e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958685e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.793855 sec - 6,612,725,918 cycles # 2.363 GHz - 12,851,623,569 instructions # 1.94 insn per cycle - 2.799020549 seconds time elapsed +TOTAL : 2.804796 sec + 6,629,701,677 cycles # 2.360 GHz + 12,848,530,488 instructions # 1.94 insn per cycle + 2.809910943 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1705) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index b5507320b6..b8155a680e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:23 +DATE: 2023-11-08_21:40:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.295762e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181123e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.251991e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.302615e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188065e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274309e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577748 sec - 2,371,472,909 cycles # 2.938 GHz - 3,662,215,838 instructions # 1.54 insn per cycle - 0.866645313 seconds time elapsed +TOTAL : 0.574549 sec + 2,352,849,250 cycles # 2.917 GHz + 3,649,350,219 instructions # 1.55 insn per cycle + 0.863978578 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.709669e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230063e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230063e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.686060e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.194010e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.194010e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.043238 sec - 12,201,253,013 cycles # 3.016 GHz - 32,520,928,331 instructions # 2.67 insn per cycle - 4.048480591 seconds time elapsed +TOTAL : 4.097911 sec + 12,184,788,464 cycles # 2.970 GHz + 32,521,623,255 instructions # 2.67 insn per cycle + 4.103328943 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.776736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.688717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.688717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.770837e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.689962e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.629894 sec - 8,006,523,859 cycles # 3.039 GHz - 18,689,561,969 instructions # 2.33 insn per cycle - 2.635155805 seconds time elapsed +TOTAL : 2.634890 sec + 7,998,179,733 cycles # 3.030 GHz + 18,690,180,922 instructions # 2.34 insn per cycle + 2.640235037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.876319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.861879e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.750654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.750654e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.544972 sec - 7,483,863,921 cycles # 2.935 GHz - 14,252,784,118 instructions # 1.90 insn per cycle - 2.550249205 seconds time elapsed +TOTAL : 2.559375 sec + 7,467,736,067 cycles # 2.913 GHz + 14,255,217,150 instructions # 1.91 insn per cycle + 2.564904201 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.940665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.960644e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.960644e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.908800e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.910304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.910304e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.495422 sec - 7,326,781,172 cycles # 2.931 GHz - 13,945,833,508 instructions # 1.90 insn per cycle - 2.500698244 seconds time elapsed +TOTAL : 2.522982 sec + 7,364,286,769 cycles # 2.913 GHz + 13,952,625,236 instructions # 1.89 insn per cycle + 2.528348787 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.636740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.108198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.108198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.584257e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.006941e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.006941e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.746264 sec - 6,527,138,912 cycles # 2.373 GHz - 13,421,028,013 instructions # 2.06 insn per cycle - 2.751679406 seconds time elapsed +TOTAL : 2.801165 sec + 6,529,127,011 cycles # 2.327 GHz + 13,421,836,325 instructions # 2.06 insn per cycle + 2.806446897 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) 
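As a sanity check of the SIMD scaling in this f_inl1_hrd0 log, the MatrixElems rates from the '+' lines above, normalized to the no-SIMD build, come out at roughly x1.8-x2.2 (a few lines of Python, values copied by hand from the hunks above):

# Values copied from the '+' EvtsPerSec[MatrixElems] lines of this log.
throughput = {
    'none': 2.194010e+06,
    'sse4': 4.689962e+06,
    'avx2': 4.750654e+06,
    '512y': 4.910304e+06,
    '512z': 4.006941e+06,
}
base = throughput['none']
for mode, value in throughput.items():
    print(f"{mode}: x{value / base:.2f}")  # 512y is the fastest here, ~x2.24

For this process the measured ratios stay well below the nominal float vector widths (up to 16 for 512z), as the excerpts themselves show.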
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index b6c42e0895..385ce72d78 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:24:50 +DATE: 2023-11-08_21:41:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300995e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194789e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.295764e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.304320e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197410e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.300141e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.573687 sec - 2,396,122,888 cycles # 2.957 GHz - 3,709,386,643 instructions # 1.55 insn per cycle - 0.867525381 seconds time elapsed +TOTAL : 0.574067 sec + 2,385,415,994 cycles # 2.943 GHz + 3,655,710,101 instructions # 1.53 insn per cycle + 0.868231647 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.274435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.306451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.306451e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254695e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267118e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267118e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.128769 sec - 9,423,056,878 cycles # 3.008 GHz - 25,306,341,141 instructions # 2.69 insn per cycle - 3.134038482 seconds time elapsed +TOTAL : 3.154781 sec + 9,423,263,848 cycles # 2.983 GHz + 25,307,020,372 instructions # 2.69 insn per cycle + 3.160042496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759584e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759584e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.134634e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.819272e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.819272e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.397339 sec - 7,201,211,606 cycles # 2.998 GHz - 16,901,413,977 instructions # 2.35 insn per cycle - 2.402789017 seconds time elapsed +TOTAL : 2.372030 sec + 7,183,608,233 cycles # 3.022 GHz + 16,901,599,192 instructions # 2.35 insn per cycle + 2.377377295 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1359) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.019910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199492e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.035295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.215553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215553e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.443323 sec - 7,147,435,963 cycles # 2.920 GHz - 13,619,110,670 instructions # 1.91 insn per cycle - 2.448969091 seconds time elapsed +TOTAL : 2.433519 sec + 7,141,153,744 cycles # 2.929 GHz + 13,619,130,373 instructions # 1.91 insn per cycle + 2.438958453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) 
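For reference when scanning these filenames: the tput logs encode the build variant directly in the name. A hypothetical decoder (the helper and regex are mine; the convention is read off the paths above, with 'sa' assumed by analogy with the .sa output directories) could look like:

import re

# log_<process>_<mad|sa>_<d|f|m>_inl<0|1>_hrd<0|1>[_<tag>].txt, as seen in
# the paths above (e.g. log_eemumu_mad_f_inl1_hrd1.txt).
NAME = re.compile(r'log_(\w+?)_(mad|sa)_([dfm])_inl([01])_hrd([01])(?:_(\w+))?\.txt')

def decode(filename):
    m = NAME.fullmatch(filename)
    if m is None:
        raise ValueError(f'unexpected log name: {filename}')
    proc, backend, prec, inl, hrd, tag = m.groups()
    return dict(process=proc, backend=backend,
                precision={'d': 'DOUBLE', 'f': 'FLOAT', 'm': 'MIXED'}[prec],
                inlineHel=int(inl), hardcodePARAM=int(hrd), variant=tag)

print(decode('log_ggtt_mad_d_inl0_hrd0_bridge.txt'))
# {'process': 'ggtt', 'backend': 'mad', 'precision': 'DOUBLE',
#  'inlineHel': 0, 'hardcodePARAM': 0, 'variant': 'bridge'}

The d/f/m letter matches the 'FP precision = DOUBLE/FLOAT/MIXED' line inside each log, and inl/hrd match the [inlineHel=...] [hardcodePARAM=...] flags in the Process line.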
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.050148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.307582e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.307582e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.071324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.326333e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.326333e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.423418 sec - 7,082,396,314 cycles # 2.918 GHz - 13,431,226,521 instructions # 1.90 insn per cycle - 2.429141482 seconds time elapsed +TOTAL : 2.408738 sec + 7,063,825,257 cycles # 2.927 GHz + 13,435,596,499 instructions # 1.90 insn per cycle + 2.414135887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.725279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338904e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.750195e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.390595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.390595e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.669392 sec - 6,366,623,257 cycles # 2.381 GHz - 13,153,230,984 instructions # 2.07 insn per cycle - 2.674848562 seconds time elapsed +TOTAL : 2.646969 sec + 6,340,373,316 cycles # 2.391 GHz + 13,154,077,274 instructions # 2.07 insn per cycle + 2.652485679 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 40be1e0fe4..a176ffc4e4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:23 +DATE: 2023-11-08_21:17:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.986561e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.920506e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.026737e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486918e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802792e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976330e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.651585 sec - 2,613,210,290 cycles # 2.977 GHz - 4,026,633,947 instructions # 1.54 insn per cycle - 0.940304085 seconds time elapsed +TOTAL : 0.656484 sec + 2,625,682,009 cycles # 2.960 GHz + 4,099,364,380 instructions # 1.56 insn per cycle + 0.946865269 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.098312e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283308e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283308e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.274850e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274850e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.126587 sec - 18,732,621,094 cycles # 3.056 GHz - 44,288,636,649 instructions # 2.36 insn per cycle - 6.131702524 seconds time elapsed +TOTAL : 6.167326 sec + 18,738,979,619 cycles # 3.037 GHz + 44,287,346,211 instructions # 2.36 insn per cycle + 6.172563885 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.724748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.279623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.279623e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.716365e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.273883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273883e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.052368 sec - 12,345,078,225 cycles # 3.044 GHz - 30,962,385,061 instructions # 2.51 insn per cycle - 4.057665704 seconds time elapsed +TOTAL : 4.065766 sec + 12,369,623,289 cycles # 3.039 GHz + 30,960,892,415 instructions # 2.50 insn per cycle + 4.071137873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.012805e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801799e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801799e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040246e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.832671e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.832671e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.527503 sec - 10,105,777,222 cycles # 2.861 GHz - 19,402,091,411 instructions # 1.92 insn per cycle - 3.532885933 seconds time elapsed +TOTAL : 3.479287 sec + 10,114,657,367 cycles # 2.903 GHz + 19,400,067,612 instructions # 1.92 insn per cycle + 3.484811762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.136223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.011490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.011490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.136561e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.021650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.021650e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.337554 sec - 9,780,270,182 cycles # 2.927 GHz - 18,984,447,401 instructions # 1.94 insn per cycle - 3.342834380 seconds time elapsed +TOTAL : 3.335937 sec + 9,745,210,637 cycles # 2.917 GHz + 18,969,865,366 instructions # 1.95 insn per cycle + 3.341324685 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1859) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476604e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476604e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.678279 sec - 8,374,553,290 cycles # 2.274 GHz - 15,066,979,076 instructions # 1.80 insn per cycle - 3.683518796 seconds time elapsed +TOTAL : 3.810646 sec + 8,364,453,052 cycles # 2.192 GHz + 15,065,277,596 instructions # 1.80 insn per cycle + 3.816111336 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 155) (512z: 1316) 
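The derived figures in the perf-style counter lines can be cross-checked directly: the GHz annotation is, to the precision printed, cycles divided by the measured time, and 'insn per cycle' is instructions over cycles. Taking the 512z block just above as an example (my own spot check; perf actually normalizes against task-clock, which for these single-threaded runs coincides with elapsed time to within ~0.2%):

# Spot check of the 512z m_inl0_hrd0 counters quoted above.
cycles = 8_364_453_052
instructions = 15_065_277_596
elapsed_s = 3.816111336

print(f'{cycles / elapsed_s / 1e9:.3f} GHz')          # 2.192, as in the log
print(f'{instructions / cycles:.2f} insn per cycle')  # 1.80, as in the log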
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index d0448f95d2..257a2b14eb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-03_19:02:57 +DATE: 2023-11-08_21:17:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.995389e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.942657e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069355e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.517340e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.835074e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.047913e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.648218 sec - 2,577,449,374 cycles # 2.937 GHz - 3,930,119,139 instructions # 1.52 insn per cycle - 0.934838617 seconds time elapsed +TOTAL : 0.657459 sec + 2,634,612,924 cycles # 2.971 GHz + 4,038,430,114 instructions # 1.53 insn per cycle + 0.947276631 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.138539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.340756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.340756e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.135032e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.337803e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337803e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.930169 sec - 17,940,598,550 cycles # 3.023 GHz - 42,539,439,563 instructions # 2.37 insn per cycle - 5.935391018 seconds time elapsed +TOTAL : 5.948882 sec + 17,974,083,702 cycles # 3.020 GHz + 42,538,758,836 instructions # 2.37 insn per cycle + 5.954247483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.737380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320541e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.746148e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.320939e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320939e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.022351 sec - 12,179,829,023 cycles # 3.025 GHz - 30,269,422,152 instructions # 2.49 insn per cycle - 4.027705928 seconds time elapsed +TOTAL : 4.005000 sec + 12,179,888,264 cycles # 3.038 GHz + 30,267,022,025 instructions # 2.48 insn per cycle + 4.010444441 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.791277e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.791277e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877404e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877404e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.544763 sec - 10,086,483,930 cycles # 2.843 GHz - 19,285,075,836 instructions # 1.91 insn per cycle - 3.550049339 seconds time elapsed +TOTAL : 3.440250 sec + 10,026,177,275 cycles # 2.911 GHz + 19,281,771,933 instructions # 1.92 insn per cycle + 3.445652030 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2162) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.153713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.048947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.048947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.165158e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.064737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.064737e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.313722 sec - 9,652,564,948 cycles # 2.909 GHz - 18,773,850,855 instructions # 1.94 insn per cycle - 3.319022077 seconds time elapsed +TOTAL : 3.297369 sec + 9,639,905,003 cycles # 2.920 GHz + 18,781,958,033 instructions # 1.95 insn per cycle + 3.302769757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1833) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576380e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576380e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.925761e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.602996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.602996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.691490 sec - 8,274,258,282 cycles # 2.239 GHz - 14,991,882,108 instructions # 1.81 insn per cycle - 3.696773496 seconds time elapsed +TOTAL : 3.664817 sec + 8,281,446,223 cycles # 2.257 GHz + 14,988,620,827 instructions # 1.81 insn per cycle + 3.670422107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1020) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index ecfe1f9032..06ab23436d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:30 +DATE: 2023-11-08_21:18:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.269149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178306e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270483e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.051243e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169781e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269231e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515028 sec - 2,190,362,135 cycles # 2.945 GHz - 3,134,430,746 instructions # 1.43 insn per cycle - 0.801320986 seconds time elapsed +TOTAL : 0.513968 sec + 2,206,571,631 cycles # 2.965 GHz + 3,147,975,302 instructions # 1.43 insn per cycle + 0.801145911 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.149781e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.001947 sec - 15,160,921,453 cycles # 3.029 GHz - 38,440,320,018 instructions # 2.54 insn per cycle - 5.007262329 seconds time elapsed +TOTAL : 4.981998 sec + 15,156,593,836 cycles # 3.040 GHz + 38,437,072,823 instructions # 2.54 insn per cycle + 4.987299145 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.537912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729582e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.640780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838553e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.070180 sec - 9,135,564,109 cycles # 2.971 GHz - 24,595,068,911 instructions # 2.69 insn per cycle - 3.075510770 seconds time elapsed +TOTAL : 2.985566 sec + 9,095,215,674 cycles # 3.042 GHz + 24,591,174,592 instructions # 2.70 insn per cycle + 2.991001875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.794659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.298456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.298456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.834785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.339543e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.339543e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.915155 sec - 5,488,800,341 cycles # 2.860 GHz - 11,269,289,809 instructions # 2.05 insn per cycle - 1.920562747 seconds time elapsed +TOTAL : 1.901490 sec + 5,454,837,265 cycles # 2.862 GHz + 11,265,546,477 instructions # 2.07 insn per cycle + 1.907039068 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.465243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.099655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.099655e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.372557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.993390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.993390e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.726047 sec - 4,948,464,581 cycles # 2.859 GHz - 10,575,268,094 instructions # 2.14 insn per cycle - 1.731560491 seconds time elapsed +TOTAL : 1.751887 sec + 4,963,717,675 cycles # 2.826 GHz + 10,572,023,161 instructions # 2.13 insn per cycle + 1.757527600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977744e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.204839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.204839e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.939400e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.168716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.740172 sec - 5,379,659,738 cycles # 1.960 GHz - 7,808,789,832 instructions # 1.45 insn per cycle - 2.745493260 seconds time elapsed +TOTAL : 2.769882 sec + 5,377,512,872 cycles # 1.939 GHz + 7,806,286,911 instructions # 1.45 insn per cycle + 2.775553290 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
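One pattern worth flagging before the _bridge log that follows: with the bridge, the CUDA run reports RMBHST and warns that it "cannot use RamboDevice, will use RamboHost", so phase-space sampling moves to the host and the combined Rmb+ME rate drops by an order of magnitude. Copying the two '+' Rmb+ME values (direct CUDA above, bridge below):

# Numbers copied from the gg_tt d_inl0_hrd0 CUDA sections (direct vs --bridge).
direct_rmb_me = 5.051243e+07  # CUD:DBL ... RMBDEV: Rambo runs on the device
bridge_rmb_me = 4.592700e+06  # CUD:DBL ... RMBHST+BRDDEV: Rambo on the host
print(f'bridge / direct = {bridge_rmb_me / direct_rmb_me:.3f}')  # ~0.091

The MatrixElems rate also drops (from ~1.17e+08 to ~3.01e+07), which I read as the extra host-device traffic the bridge implies, though the logs themselves do not say so.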
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index dd2f256477..8de158cb65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:20 +DATE: 2023-11-08_21:51:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.496633e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.880527e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.880527e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.592700e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.008872e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.008872e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.808083 sec - 3,120,895,454 cycles # 2.971 GHz - 4,726,889,577 instructions # 1.51 insn per cycle - 1.107972527 seconds time elapsed +TOTAL : 0.804684 sec + 3,099,147,756 cycles # 2.967 GHz + 4,823,816,385 instructions # 1.56 insn per cycle + 1.102344703 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.117962e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179706e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.051112e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.111963e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.111963e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.135527 sec - 15,504,544,823 cycles # 3.016 GHz - 38,497,224,440 instructions # 2.48 insn per cycle - 5.142229259 seconds time elapsed +TOTAL : 5.297825 sec + 15,481,852,434 cycles # 2.919 GHz + 38,496,050,546 instructions # 2.49 insn per cycle + 5.304382607 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.790745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.790745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.421539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.610351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.610351e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.098715 sec - 9,432,801,004 cycles # 3.038 GHz - 24,773,895,780 instructions # 2.63 insn per cycle - 3.105439323 seconds time elapsed +TOTAL : 3.252273 sec + 9,439,657,096 cycles # 2.897 GHz + 24,775,783,847 instructions # 2.62 insn per cycle + 3.259008663 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.527781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.981315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.981315e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.465972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.935898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.935898e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.098555 sec - 5,826,323,105 cycles # 2.789 GHz - 11,554,423,664 instructions # 1.98 insn per cycle - 2.105206679 seconds time elapsed +TOTAL : 2.107608 sec + 5,817,196,530 cycles # 2.752 GHz + 11,552,661,145 instructions # 1.99 insn per cycle + 2.114326410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.300396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.893264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.893264e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.009635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.580924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.580924e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.849117 sec - 5,294,307,248 cycles # 2.854 GHz - 10,856,382,305 instructions # 2.05 insn per cycle - 1.855861110 seconds time elapsed +TOTAL : 1.934696 sec + 5,303,416,333 cycles # 2.735 GHz + 10,861,487,391 instructions # 2.05 insn per cycle + 1.941424882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.891057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.111611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.111611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.701730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.912869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912869e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.882235 sec - 5,742,873,090 cycles # 1.988 GHz - 8,048,787,968 instructions # 1.40 insn per cycle - 2.889049440 seconds time elapsed +TOTAL : 3.025583 sec + 5,727,782,590 cycles # 1.894 GHz + 8,052,158,492 instructions # 1.41 insn per cycle + 3.032424174 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70c42f96ca..fc433be1ef 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:21 +DATE: 2023-11-08_22:04:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.579966e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154296e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270387e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.726172e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159376e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270269e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.619804 sec - 2,500,171,473 cycles # 2.947 GHz - 3,610,462,854 instructions # 1.44 insn per cycle - 0.906022247 seconds time elapsed +TOTAL : 0.626000 sec + 2,413,951,090 cycles # 2.822 GHz + 3,508,959,445 instructions # 1.45 insn per cycle + 0.913280230 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204103e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204103e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.061672 sec - 15,345,417,554 cycles # 3.029 GHz - 38,452,483,858 instructions # 2.51 insn per cycle - 5.067127392 seconds time elapsed +TOTAL : 4.967265 sec + 15,332,653,861 cycles # 3.084 GHz + 38,452,810,595 instructions # 2.51 insn per cycle + 4.972510854 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.787517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.787517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.898409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.898409e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.081938 sec - 9,306,122,505 cycles # 3.015 GHz - 24,590,602,612 instructions # 2.64 insn per cycle - 3.087467598 seconds time elapsed +TOTAL : 2.999548 sec + 9,281,583,975 cycles # 3.090 GHz + 24,591,762,393 instructions # 2.65 insn per cycle + 3.004985897 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.780444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.284766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.284766e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.871319e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.385365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385365e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.978919 sec - 5,659,108,727 cycles # 2.853 GHz - 11,248,307,846 instructions # 1.99 insn per cycle - 1.984493875 seconds time elapsed +TOTAL : 1.950157 sec + 5,690,984,261 cycles # 2.911 GHz + 11,247,762,981 instructions # 1.98 insn per cycle + 1.955461495 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.409554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.043503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.043503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.503413e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.137413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.137413e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.801971 sec - 5,131,678,035 cycles # 2.841 GHz - 10,518,217,961 instructions # 2.05 insn per cycle - 1.807387516 seconds time elapsed +TOTAL : 1.776614 sec + 5,148,876,403 cycles # 2.891 GHz + 10,521,901,939 instructions # 2.04 insn per cycle + 1.781976606 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.952294e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.075607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.312212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.312212e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.820832 sec - 5,565,619,645 cycles # 1.970 GHz - 7,754,617,723 instructions # 1.39 insn per cycle - 2.826352548 seconds time elapsed +TOTAL : 2.736817 sec + 5,563,466,882 cycles # 2.030 GHz + 7,754,129,949 instructions # 1.39 insn per cycle + 2.742022793 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 4837b41444..f949e08a8e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:45:01 +DATE: 2023-11-08_22:01:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.583777e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154968e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271096e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.746837e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.161251e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269946e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557101 sec - 2,322,977,037 cycles # 2.953 GHz - 3,599,423,025 instructions # 1.55 insn per cycle - 0.843882316 seconds time elapsed +TOTAL : 0.546588 sec + 2,339,106,527 cycles # 3.024 GHz + 3,639,530,742 instructions # 1.56 insn per cycle + 0.830477401 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.134010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.196717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.196717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.194821e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.259419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.259419e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.022340 sec - 15,161,844,495 cycles # 3.017 GHz - 38,436,020,868 instructions # 2.54 insn per cycle - 5.028057319 seconds time elapsed +TOTAL : 4.881299 sec + 15,162,215,504 cycles # 3.104 GHz + 38,436,564,546 instructions # 2.54 insn per cycle + 4.886593937 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.611425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807723e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807723e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.717533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.921164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.921164e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.009043 sec - 9,092,248,013 cycles # 3.018 GHz - 24,590,993,356 instructions # 2.70 insn per cycle - 3.014816078 seconds time elapsed +TOTAL : 2.924290 sec + 9,098,563,572 cycles # 3.107 GHz + 24,592,229,111 instructions # 2.70 insn per cycle + 2.929612410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.765157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.263695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.263695e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.896966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.423160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.423160e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.924911 sec - 5,492,799,049 cycles # 2.847 GHz - 11,264,994,094 instructions # 2.05 insn per cycle - 1.930399853 seconds time elapsed +TOTAL : 1.883509 sec + 5,473,701,924 cycles # 2.899 GHz + 11,265,098,305 instructions # 2.06 insn per cycle + 1.888826353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.461458e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.086226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.086226e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.333944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.936194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.936194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.728063 sec - 4,951,669,022 cycles # 2.858 GHz - 10,569,075,843 instructions # 2.13 insn per cycle - 1.733593807 seconds time elapsed +TOTAL : 1.759678 sec + 4,959,739,230 cycles # 2.811 GHz + 10,570,009,461 instructions # 2.13 insn per cycle + 1.765083600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.938989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.108089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.344532e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.344532e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.768049 sec - 5,404,539,268 cycles # 1.950 GHz - 7,804,733,779 instructions # 1.44 insn per cycle - 2.773480694 seconds time elapsed +TOTAL : 2.655128 sec + 5,388,561,520 cycles # 2.026 GHz + 7,804,959,196 instructions # 1.45 insn per cycle + 2.660471194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 04f32ac3bc..6c72f6887e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:41:43 +DATE: 2023-11-08_21:58:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
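The '-'/'+' throughput pairs in these regenerated logs differ only at the few-percent level, i.e. within run-to-run jitter on the same node. A hypothetical helper (not part of the repo) for flagging larger-than-noise changes when diffing two tput logs:

    # Relative change between an old (-) and new (+) measurement from the hunk above.
    def rel_delta(old, new):
        """Fractional change of a throughput value between two runs."""
        return (new - old) / old

    old, new = 4.163796e+05, 4.344532e+05  # 512z EvtsPerSec[MatrixElems], - and + lines
    d = rel_delta(old, new)
    print(f"{d:+.2%}")                     # ~ +4.3%, still compatible with jitter
    assert abs(d) < 0.10, "only deltas beyond ~10% would suggest a real change"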
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.845624e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154000e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267501e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.993868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158186e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266776e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.705622 sec - 2,764,377,825 cycles # 2.955 GHz - 4,322,445,800 instructions # 1.56 insn per cycle - 0.992638570 seconds time elapsed +TOTAL : 0.697399 sec + 2,787,983,765 cycles # 3.019 GHz + 4,369,945,413 instructions # 1.57 insn per cycle + 0.982292174 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.118266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.179189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.179189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.151791e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.213814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213814e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.055003 sec - 15,355,352,228 cycles # 3.035 GHz - 38,436,037,499 instructions # 2.50 insn per cycle - 5.060369145 seconds time elapsed +TOTAL : 4.977336 sec + 15,184,395,969 cycles # 3.048 GHz + 38,438,963,256 instructions # 2.53 insn per cycle + 4.982648512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.814626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.814626e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.705404e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.908313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.908313e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.002993 sec - 9,098,824,080 cycles # 3.025 GHz - 24,590,228,698 instructions # 2.70 insn per cycle - 3.008485414 seconds time elapsed +TOTAL : 2.933004 sec + 9,125,855,621 cycles # 3.107 GHz + 24,590,801,711 instructions # 2.69 insn per cycle + 2.938291037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.738465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.252767e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.252767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.720849e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.210353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.210353e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934521 sec - 5,491,674,204 cycles # 2.833 GHz - 11,265,170,941 instructions # 2.05 insn per cycle - 1.939950087 seconds time elapsed +TOTAL : 1.938623 sec + 5,466,827,554 cycles # 2.814 GHz + 11,265,438,862 instructions # 2.06 insn per cycle + 1.943823759 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.341479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.957193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.957193e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.635980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.287954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.287954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759206 sec - 4,958,873,003 cycles # 2.811 GHz - 10,570,272,367 instructions # 2.13 insn per cycle - 1.764825335 seconds time elapsed +TOTAL : 1.682323 sec + 4,955,566,146 cycles # 2.937 GHz + 10,571,524,775 instructions # 2.13 insn per cycle + 1.687724736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.934828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.091835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.326386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.326386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.772888 sec - 5,409,288,056 cycles # 1.948 GHz - 7,806,084,388 instructions # 1.44 insn per cycle - 2.778257755 seconds time elapsed +TOTAL : 2.665788 sec + 5,400,449,096 cycles # 2.023 GHz + 7,805,014,579 instructions # 1.45 insn per cycle + 2.671129758 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
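The 'Internal loops fptype_sv' headers encode the SIMD vector width as register width divided by the size of the floating-point type: the DOUBLE logs above report VECTOR[2]/[4]/[8] for sse4/avx2/512z, and the FLOAT logs further down report VECTOR[4]/[8]/[16]. A one-line check of that arithmetic:

    # SIMD vector width = register bits / (8 * sizeof(fptype)), as in the log headers.
    for simd, bits in [("sse4", 128), ("avx2", 256), ("512z", 512)]:
        for fp, nbytes in [("DOUBLE", 8), ("FLOAT", 4)]:
            print(simd, fp, f"VECTOR[{bits // (8 * nbytes)}]")
    # sse4 DOUBLE VECTOR[2] ... 512z FLOAT VECTOR[16], matching the headers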
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 4e3b221e19..3a0f520dcc 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:03:57 +DATE: 2023-11-08_21:18:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.258167e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174363e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266024e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.048585e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265645e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515882 sec - 2,147,525,845 cycles # 2.877 GHz - 3,086,933,024 instructions # 1.44 insn per cycle - 0.803849250 seconds time elapsed +TOTAL : 0.515938 sec + 2,194,564,244 cycles # 2.948 GHz + 3,170,767,882 instructions # 1.44 insn per cycle + 0.803319972 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.170531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.145803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.208726e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.208726e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.935479 sec - 15,016,135,362 cycles # 3.040 GHz - 40,166,123,209 instructions # 2.67 insn per cycle - 4.940913654 seconds time elapsed +TOTAL : 4.991935 sec + 15,019,527,641 cycles # 3.006 GHz + 40,165,389,576 instructions # 2.67 insn per cycle + 4.997467241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.815308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.035943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.035943e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.795270e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.015877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.015877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.853658 sec - 8,679,305,567 cycles # 3.037 GHz - 23,688,803,932 instructions # 2.73 insn per cycle - 2.859362026 seconds time elapsed +TOTAL : 2.867596 sec + 8,671,075,725 cycles # 3.019 GHz + 23,683,669,849 instructions # 2.73 insn per cycle + 2.873212548 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2069) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.201194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.599502e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599502e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.180539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.583447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.119971 sec - 6,076,924,812 cycles # 2.860 GHz - 13,078,281,182 instructions # 2.15 insn per cycle - 2.125352086 seconds time elapsed +TOTAL : 2.128793 sec + 6,072,650,571 cycles # 2.846 GHz + 13,074,915,373 instructions # 2.15 insn per cycle + 2.134316674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) 
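The perf-style counters quoted in each block are internally consistent: 'insn per cycle' is instructions/cycles, and the quoted GHz is cycles per second of CPU time (perf normalises by task-clock, so wall-clock gives a close approximation). Checking the avx2 hrd1 '+' values just above:

    # Consistency check of the perf counters (values copied from the '+' lines above).
    cycles       = 6_072_650_571
    instructions = 13_074_915_373
    elapsed      = 2.134316674                            # seconds time elapsed
    print(f"{instructions / cycles:.2f} insn per cycle")  # 2.15, as printed
    print(f"~{cycles / elapsed / 1e9:.3f} GHz")           # ~2.845 vs the printed 2.846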
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.478450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.920522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.920522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.449593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.890564e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.890564e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.017570 sec - 5,787,274,892 cycles # 2.862 GHz - 12,336,105,279 instructions # 2.13 insn per cycle - 2.023012261 seconds time elapsed +TOTAL : 2.028925 sec + 5,794,294,617 cycles # 2.851 GHz + 12,335,132,296 instructions # 2.13 insn per cycle + 2.034385767 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519779e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701184e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.645486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838740e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.086221 sec - 5,817,765,621 cycles # 1.888 GHz - 9,621,068,231 instructions # 1.65 insn per cycle - 3.091564620 seconds time elapsed +TOTAL : 2.982084 sec + 5,814,493,383 cycles # 1.947 GHz + 9,613,724,456 instructions # 1.65 insn per cycle + 2.987600867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1510) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 3337c01ad4..1cbf67a236 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:16 +DATE: 2023-11-08_21:41:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554755e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155174e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268743e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.595048e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160670e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269203e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526687 sec - 2,250,994,801 cycles # 2.926 GHz - 3,097,737,524 instructions # 1.38 insn per cycle - 0.826717654 seconds time elapsed +TOTAL : 0.521954 sec + 2,216,810,301 cycles # 2.935 GHz + 3,140,499,783 instructions # 1.42 insn per cycle + 0.812101303 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.473532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556761e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.505174e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.591402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.591402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.344783 sec - 13,019,193,404 cycles # 2.993 GHz - 34,405,663,599 instructions # 2.64 insn per cycle - 4.350365607 seconds time elapsed +TOTAL : 4.291146 sec + 13,017,199,090 cycles # 3.030 GHz + 34,406,598,887 instructions # 2.64 insn per cycle + 4.296733375 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.106755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.249963e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.249963e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.483866 sec - 10,607,531,951 cycles # 3.041 GHz - 24,022,392,993 instructions # 2.26 insn per cycle - 3.489298956 seconds time elapsed +TOTAL : 3.481603 sec + 10,608,834,284 cycles # 3.044 GHz + 24,023,421,035 instructions # 2.26 insn per cycle + 3.487384559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.787875e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.756679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.089717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.295291 sec - 6,588,895,934 cycles # 2.865 GHz - 12,413,954,044 instructions # 1.88 insn per cycle - 2.300926049 seconds time elapsed +TOTAL : 2.309669 sec + 6,605,241,660 cycles # 2.854 GHz + 12,414,642,119 instructions # 1.88 insn per cycle + 2.315374830 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.072251e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.445053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.445053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.883072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.243446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.243446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.171777 sec - 6,238,931,665 cycles # 2.866 GHz - 11,585,660,605 instructions # 1.86 insn per cycle - 2.177410338 seconds time elapsed +TOTAL : 2.253913 sec + 6,256,146,881 cycles # 2.770 GHz + 11,588,754,266 instructions # 1.85 insn per cycle + 2.259602028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.229600e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.229600e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.014282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.246391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.246391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.727363 sec - 5,337,713,756 cycles # 1.954 GHz - 9,308,309,205 instructions # 1.74 insn per cycle - 2.732896997 seconds time elapsed +TOTAL : 2.718420 sec + 5,340,176,505 cycles # 1.961 GHz + 9,309,276,244 instructions # 1.74 insn per cycle + 2.724177871 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) 
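Each block is keyed by a build directory of the form build.<simd>_<d|f>_inl<0|1>_hrd<0|1>, where d/f selects the DOUBLE/FLOAT logs and inl/hrd mirror the [inlineHel=...] [hardcodePARAM=...] banner. A hypothetical tag parser (not part of the repo) for grouping results when post-processing these logs:

    import re

    # Hypothetical parser for build tags such as 'build.512y_d_inl1_hrd0'.
    TAG = re.compile(r"build\.(?P<simd>\w+?)_(?P<fp>[df])_inl(?P<inl>[01])_hrd(?P<hrd>[01])")

    m = TAG.search("build.512y_d_inl1_hrd0/runTest.exe")
    print(m.groupdict())   # {'simd': '512y', 'fp': 'd', 'inl': '1', 'hrd': '0'}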
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 64e33308d5..086ff92179 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:25:43 +DATE: 2023-11-08_21:42:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571117e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157677e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270835e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.601958e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157408e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268312e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523342 sec - 2,241,527,426 cycles # 2.944 GHz - 3,209,964,665 instructions # 1.43 insn per cycle - 0.819917937 seconds time elapsed +TOTAL : 0.523179 sec + 2,197,044,574 cycles # 2.904 GHz + 3,180,010,549 instructions # 1.45 insn per cycle + 0.813333970 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.658099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.754988e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.754988e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.643503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.049112 sec - 12,374,606,485 cycles # 3.053 GHz - 35,058,016,337 instructions # 2.83 insn per cycle - 4.054549094 seconds time elapsed +TOTAL : 4.216286 sec + 12,375,189,012 cycles # 2.932 GHz + 35,060,083,206 instructions # 2.83 insn per cycle + 4.222169031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.088523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231607e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.067813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209694e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.500477 sec - 10,694,410,777 cycles # 3.051 GHz - 23,099,336,289 instructions # 2.16 insn per cycle - 3.506159729 seconds time elapsed +TOTAL : 3.525507 sec + 10,698,056,208 cycles # 3.031 GHz + 23,100,081,560 instructions # 2.16 insn per cycle + 3.531306963 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.105721e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.492220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.492220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.118146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.507530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.507530e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.158641 sec - 6,163,495,994 cycles # 2.849 GHz - 11,969,488,967 instructions # 1.94 insn per cycle - 2.164367762 seconds time elapsed +TOTAL : 2.154521 sec + 6,166,402,806 cycles # 2.856 GHz + 11,969,983,926 instructions # 1.94 insn per cycle + 2.160177772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) 
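The '=Symbols in CPPProcess.o=' lines count how many instructions of each SIMD flavour ended up in the object file. A rough, hypothetical approximation of the idea (the real classification is finer grained, e.g. splitting '512y' from 'avx2' by mnemonic rather than just by register width):

    import re, subprocess

    # Crude proxy: count 256-bit (ymm) and 512-bit (zmm) register references
    # in the disassembly of the object file.
    asm = subprocess.run(["objdump", "-d", "CPPProcess.o"],
                         capture_output=True, text=True).stdout
    print("ymm refs:", len(re.findall(r"%ymm", asm)),
          "zmm refs:", len(re.findall(r"%zmm", asm)))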
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.169198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.238236e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.649069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.649069e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.133549 sec - 6,039,094,179 cycles # 2.824 GHz - 11,144,077,781 instructions # 1.85 insn per cycle - 2.139096234 seconds time elapsed +TOTAL : 2.108281 sec + 6,026,300,401 cycles # 2.854 GHz + 11,141,738,024 instructions # 1.85 insn per cycle + 2.114031870 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.003701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233597e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.978977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.208595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.208595e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726476 sec - 5,224,063,612 cycles # 1.913 GHz - 9,034,702,359 instructions # 1.73 insn per cycle - 2.732050023 seconds time elapsed +TOTAL : 2.742076 sec + 5,240,960,370 cycles # 1.908 GHz + 9,033,887,762 instructions # 1.72 insn per cycle + 2.747795404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 8d92c550fe..eb4d5419ee 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:04:25 +DATE: 2023-11-08_21:19:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.099342e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.699387e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953526e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.037656e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.679710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950060e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.471293 sec - 2,042,101,644 cycles # 2.948 GHz - 2,946,816,826 instructions # 1.44 insn per cycle - 0.749881107 seconds time elapsed +TOTAL : 0.474624 sec + 2,093,800,407 cycles # 2.948 GHz + 2,971,543,250 instructions # 1.42 insn per cycle + 0.767958808 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.296642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371475e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.294584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.370694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.370694e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.650028 sec - 14,160,157,406 cycles # 3.043 GHz - 38,398,040,352 instructions # 2.71 insn per cycle - 4.655270250 seconds time elapsed +TOTAL : 4.654140 sec + 14,153,083,054 cycles # 3.038 GHz + 38,392,852,878 instructions # 2.71 insn per cycle + 4.659227784 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.139917e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.562152e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.562152e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.142013e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564188e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124632 sec - 6,476,959,128 cycles # 3.042 GHz - 15,834,256,517 instructions # 2.44 insn per cycle - 2.129768462 seconds time elapsed +TOTAL : 2.123842 sec + 6,471,678,330 cycles # 3.041 GHz + 15,829,749,383 instructions # 2.45 insn per cycle + 2.129132115 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.088663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043198e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.403745e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082517e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.237397 sec - 3,465,504,689 cycles # 2.794 GHz - 7,611,207,779 instructions # 2.20 insn per cycle - 1.242588855 seconds time elapsed +TOTAL : 1.198427 sec + 3,459,269,129 cycles # 2.876 GHz + 7,606,844,485 instructions # 2.20 insn per cycle + 1.203597878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.457008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096549e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096549e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.005658e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168806e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168806e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.196326 sec - 3,247,822,045 cycles # 2.704 GHz - 7,220,309,293 instructions # 2.22 insn per cycle - 1.201704693 seconds time elapsed +TOTAL : 1.126360 sec + 3,254,355,778 cycles # 2.878 GHz + 7,215,715,994 instructions # 2.22 insn per cycle + 1.131662200 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.679715e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.389169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.389169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.276060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.101034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.101034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.658315 sec - 3,062,288,257 cycles # 1.842 GHz - 5,850,668,317 instructions # 1.91 insn per cycle - 1.663822965 seconds time elapsed +TOTAL : 1.528725 sec + 3,068,447,705 cycles # 2.001 GHz + 5,846,027,778 instructions # 1.91 insn per cycle + 1.534029615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
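Comparing this FLOAT log with a DOUBLE avx2 block near the top of this section (~6.4e5 MEs/sec) shows the expected effect of doubling the SIMD width, roughly a 1.7x speedup rather than the ideal 2x; the two numbers come from different RNG-mode variants of the same gg_tt build, so this is indicative only:

    # FLOAT vs DOUBLE avx2 throughput (values copied from '+' lines in this section).
    flt, dbl = 1.082517e+06, 6.385365e+05  # EvtsPerSec[MatrixElems], f vs d
    print(f"float/double speedup: {flt / dbl:.2f}x")  # ~1.70x for twice the width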
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index a1ebef89d2..459315b5db 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:35:48 +DATE: 2023-11-08_21:52:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.064201e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.498245e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.498245e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.229057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.759945e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.759945e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.670260 sec - 2,637,877,021 cycles # 2.942 GHz - 4,088,256,570 instructions # 1.55 insn per cycle - 0.955124097 seconds time elapsed +TOTAL : 0.663839 sec + 2,633,797,388 cycles # 2.963 GHz + 4,071,573,226 instructions # 1.55 insn per cycle + 0.947283739 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.270912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.344925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.280486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353996e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.744982 sec - 14,378,860,027 cycles # 3.027 GHz - 38,435,472,086 instructions # 2.67 insn per cycle - 4.751370421 seconds time elapsed +TOTAL : 4.724775 sec + 14,342,143,211 cycles # 3.033 GHz + 38,438,250,053 instructions # 2.68 insn per cycle + 4.731136861 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.017460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.072115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.484269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.484269e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.222354 sec - 6,685,137,863 cycles # 3.001 GHz - 16,109,819,565 instructions # 2.41 insn per cycle - 2.228696460 seconds time elapsed +TOTAL : 2.197377 sec + 6,673,460,854 cycles # 3.029 GHz + 16,110,044,412 instructions # 2.41 insn per cycle + 2.203637127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.204872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.057185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057185e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.156025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050843e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050843e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.267912 sec - 3,665,496,802 cycles # 2.878 GHz - 7,843,464,752 instructions # 2.14 insn per cycle - 1.274414413 seconds time elapsed +TOTAL : 1.276703 sec + 3,679,224,682 cycles # 2.872 GHz + 7,844,733,298 instructions # 2.13 insn per cycle + 1.282950304 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.639653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.116975e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.116975e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.848037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.141843e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.141843e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.220373 sec - 3,444,640,052 cycles # 2.810 GHz - 7,451,522,975 instructions # 2.16 insn per cycle - 1.226715796 seconds time elapsed +TOTAL : 1.194194 sec + 3,452,479,238 cycles # 2.878 GHz + 7,452,050,539 instructions # 2.16 insn per cycle + 1.200346156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.178040e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.972638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.972638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.221197e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.012402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.012402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.593853 sec - 3,283,201,976 cycles # 2.053 GHz - 6,099,788,393 instructions # 1.86 insn per cycle - 1.600161746 seconds time elapsed +TOTAL : 1.583142 sec + 3,273,382,507 cycles # 2.061 GHz + 6,100,795,667 instructions # 1.86 insn per cycle + 1.589319377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index b7fb0d6959..dcdda81950 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-03_19:48:49 +DATE: 2023-11-08_22:05:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.431152e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624289e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.946132e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.826188e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.648877e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951378e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.564134 sec - 2,302,613,621 cycles # 2.942 GHz - 3,377,451,746 instructions # 1.47 insn per cycle - 0.841499880 seconds time elapsed +TOTAL : 0.557947 sec + 2,332,705,336 cycles # 3.000 GHz + 3,420,801,676 instructions # 1.47 insn per cycle + 0.836912289 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.289992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.416548e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.416548e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.718949 sec - 14,318,249,819 cycles # 3.032 GHz - 38,421,429,911 instructions # 2.68 insn per cycle - 4.724102129 seconds time elapsed +TOTAL : 4.618720 sec + 14,313,897,069 cycles # 3.097 GHz + 38,421,663,028 instructions # 2.68 insn per cycle + 4.623775275 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.232630e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.661001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.661001e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.204441 sec - 6,639,814,735 cycles # 3.006 GHz - 15,841,902,427 instructions # 2.39 insn per cycle - 2.209539727 seconds time elapsed +TOTAL : 2.140530 sec + 6,636,885,571 cycles # 3.094 GHz + 15,842,171,589 instructions # 2.39 insn per cycle + 2.145594820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.307822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.070999e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.070999e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.545031e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.097804e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.097804e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0
-TOTAL : 1.265035 sec
- 3,649,285,785 cycles # 2.875 GHz
- 7,591,137,573 instructions # 2.08 insn per cycle
- 1.270319196 seconds time elapsed
+TOTAL : 1.233588 sec
+ 3,635,079,459 cycles # 2.936 GHz
+ 7,590,685,166 instructions # 2.09 insn per cycle
+ 1.238746125 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.974832e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.160037e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.160037e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024875e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.195413e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.195413e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0
-TOTAL : 1.191816 sec
- 3,426,519,284 cycles # 2.864 GHz
- 7,166,067,248 instructions # 2.09 insn per cycle
- 1.197132868 seconds time elapsed
+TOTAL : 1.160670 sec
+ 3,429,453,475 cycles # 2.944 GHz
+ 7,166,679,947 instructions # 2.09 insn per cycle
+ 1.165684786 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.265683e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.068951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.068951e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.262300e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.049639e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.049639e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.584018 sec
- 3,241,188,093 cycles # 2.041 GHz
- 5,795,628,367 instructions # 1.79 insn per cycle
- 1.589192883 seconds time elapsed
+TOTAL : 1.582365 sec
+ 3,235,924,413 cycles # 2.039 GHz
+ 5,796,611,749 instructions # 1.79 insn per cycle
+ 1.587507042 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
index 30f4fadf92..831fd0fa9f 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:45:28
+DATE: 2023-11-08_22:01:51

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.447666e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.634082e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.951326e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.837632e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.654775e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.958238e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.513708 sec
- 2,149,338,807 cycles # 2.936 GHz
- 3,363,855,189 instructions # 1.57 insn per cycle
- 0.790810409 seconds time elapsed
+TOTAL : 0.503341 sec
+ 2,173,332,424 cycles # 3.019 GHz
+ 3,385,289,251 instructions # 1.56 insn per cycle
+ 0.779359289 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.247364e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.319306e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319306e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.329232e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.405368e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.405368e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.751713 sec
- 14,161,394,696 cycles # 2.978 GHz
- 38,393,782,229 instructions # 2.71 insn per cycle
- 4.756965371 seconds time elapsed
+TOTAL : 4.586570 sec
+ 14,159,897,717 cycles # 3.085 GHz
+ 38,395,355,740 instructions # 2.71 insn per cycle
+ 4.591702989 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.102956e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.519127e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.519127e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.170239e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.592491e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.592491e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.140784 sec
- 6,476,072,518 cycles # 3.019 GHz
- 15,828,662,766 instructions # 2.44 insn per cycle
- 2.146087935 seconds time elapsed
+TOTAL : 2.112173 sec
+ 6,472,075,786 cycles # 3.058 GHz
+ 15,829,638,315 instructions # 2.45 insn per cycle
+ 2.117221818 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.357298e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.077430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.077430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.605537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.104706e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.104706e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.205006 sec
- 3,468,184,099 cycles # 2.868 GHz
- 7,606,030,531 instructions # 2.19 insn per cycle
- 1.210138102 seconds time elapsed
+TOTAL : 1.174316 sec
+ 3,462,364,333 cycles # 2.937 GHz
+ 7,606,467,395 instructions # 2.20 insn per cycle
+ 1.179522425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.559739e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.106426e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.106426e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024286e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.190805e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.190805e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.182909 sec
- 3,252,386,286 cycles # 2.739 GHz
- 7,215,128,616 instructions # 2.22 insn per cycle
- 1.188234183 seconds time elapsed
+TOTAL : 1.105549 sec
+ 3,254,375,411 cycles # 2.932 GHz
+ 7,215,571,393 instructions # 2.22 insn per cycle
+ 1.110519445 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.332938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.163555e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.163555e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.518662e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.361331e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.361331e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.514986 sec
- 3,076,222,583 cycles # 2.024 GHz
- 5,845,646,643 instructions # 1.90 insn per cycle
- 1.520503790 seconds time elapsed
+TOTAL : 1.478873 sec
+ 3,068,230,484 cycles # 2.069 GHz
+ 5,846,211,473 instructions # 1.91 insn per cycle
+ 1.484040601 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 65eed836f1..bb838a2196 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:42:11
+DATE: 2023-11-08_21:58:39

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.910755e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.623741e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.938668e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.130902e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.643491e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.939128e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
-TOTAL : 0.613295 sec
- 2,456,965,302 cycles # 2.952 GHz
- 3,803,211,416 instructions # 1.55 insn per cycle
- 0.890835389 seconds time elapsed
+TOTAL : 0.604908 sec
+ 2,484,417,262 cycles # 3.021 GHz
+ 3,852,149,899 instructions # 1.55 insn per cycle
+ 0.881326202 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.291712e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.365790e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.365790e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.328989e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.404078e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.404078e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.660750 sec
- 14,151,818,953 cycles # 3.034 GHz
- 38,392,284,342 instructions # 2.71 insn per cycle
- 4.665929439 seconds time elapsed
+TOTAL : 4.586292 sec
+ 14,210,336,618 cycles # 3.096 GHz
+ 38,392,847,533 instructions # 2.70 insn per cycle
+ 4.591549142 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
@@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.100691e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.531126e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.531126e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.239674e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.668279e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.668279e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.141262 sec
- 6,484,613,456 cycles # 3.022 GHz
- 15,829,197,800 instructions # 2.44 insn per cycle
- 2.146554392 seconds time elapsed
+TOTAL : 2.084661 sec
+ 6,470,762,281 cycles # 3.098 GHz
+ 15,829,570,536 instructions # 2.45 insn per cycle
+ 2.089664033 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
@@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.341999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.073892e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073892e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.589227e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.103396e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.103396e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.207094 sec
- 3,469,517,910 cycles # 2.864 GHz
- 7,605,958,162 instructions # 2.19 insn per cycle
- 1.212334488 seconds time elapsed
+TOTAL : 1.175545 sec
+ 3,466,544,418 cycles # 2.938 GHz
+ 7,606,584,140 instructions # 2.19 insn per cycle
+ 1.180575347 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.000164e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.163047e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.163047e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.024662e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.193480e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.193480e+06 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.132933 sec
- 3,264,238,503 cycles # 2.869 GHz
- 7,214,964,009 instructions # 2.21 insn per cycle
- 1.138315941 seconds time elapsed
+TOTAL : 1.105660 sec
+ 3,258,740,690 cycles # 2.936 GHz
+ 7,215,101,525 instructions # 2.21 insn per cycle
+ 1.110765672 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
@@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.339791e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.166023e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.166023e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.584208e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.436586e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.436586e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.514355 sec
- 3,071,490,694 cycles # 2.022 GHz
- 5,845,279,944 instructions # 1.90 insn per cycle
- 1.519539150 seconds time elapsed
+TOTAL : 1.465958 sec
+ 3,064,168,908 cycles # 2.084 GHz
+ 5,845,466,179 instructions # 1.91 insn per cycle
+ 1.471139277 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 06d8f7d09d..d667b6dbf4 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:04:48
+DATE: 2023-11-08_21:19:44

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.108032e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.751852e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017010e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.049999e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.742417e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.025106e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.473084 sec
- 2,025,626,323 cycles # 2.920 GHz
- 2,923,341,053 instructions # 1.44 insn per cycle
- 0.752440698 seconds time elapsed
+TOTAL : 0.475958 sec
+ 2,061,164,716 cycles # 2.907 GHz
+ 2,917,299,650 instructions # 1.42 insn per cycle
+ 0.766837667 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.226197e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.296658e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.296658e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.217835e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.287538e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.287538e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.795639 sec
- 14,422,319,778 cycles # 3.005 GHz
- 39,889,404,210 instructions # 2.77 insn per cycle
- 4.800761254 seconds time elapsed
+TOTAL : 4.813699 sec
+ 14,428,562,676 cycles # 2.998 GHz
+ 39,888,508,384 instructions # 2.76 insn per cycle
+ 4.818824247 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.840353e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.410043e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.410043e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.957468e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.536679e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.536679e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.880181 sec
- 5,610,891,745 cycles # 2.978 GHz
- 15,305,908,167 instructions # 2.73 insn per cycle
- 1.885354787 seconds time elapsed
+TOTAL : 1.845039 sec
+ 5,590,599,138 cycles # 3.023 GHz
+ 15,299,534,426 instructions # 2.74 insn per cycle
+ 1.850198462 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.584020e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.270908e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.270908e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.651061e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.332537e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.332537e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.679496 sec
- 4,739,407,479 cycles # 2.814 GHz
- 9,752,382,085 instructions # 2.06 insn per cycle
- 1.685063058 seconds time elapsed
+TOTAL : 1.660892 sec
+ 4,740,556,619 cycles # 2.846 GHz
+ 9,747,822,441 instructions # 2.06 insn per cycle
+ 1.666191221 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3710) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.785300e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.495008e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.495008e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.778515e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.494686e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.494686e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.630325 sec
- 4,628,420,386 cycles # 2.831 GHz
- 9,343,264,044 instructions # 2.02 insn per cycle
- 1.635531127 seconds time elapsed
+TOTAL : 1.631450 sec
+ 4,628,439,590 cycles # 2.829 GHz
+ 9,339,816,116 instructions # 2.02 insn per cycle
+ 1.636603727 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.035393e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.577354e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.577354e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.981004e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.517698e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.517698e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.821625 sec
- 3,652,061,133 cycles # 2.000 GHz
- 7,049,331,376 instructions # 1.93 insn per cycle
- 1.826875192 seconds time elapsed
+TOTAL : 1.837853 sec
+ 3,663,588,168 cycles # 1.989 GHz
+ 7,045,799,249 instructions # 1.92 insn per cycle
+ 1.843187351 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2606) (512y: 12) (512z: 2221)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index 430bbd2c8e..e94beeddac 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:26:11
+DATE: 2023-11-08_21:42:28

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.386931e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.620878e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.939459e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.362873e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.640443e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.957691e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478570 sec
- 2,066,322,031 cycles # 2.937 GHz
- 2,939,169,205 instructions # 1.42 insn per cycle
- 0.760998289 seconds time elapsed
+TOTAL : 0.478743 sec
+ 2,066,773,251 cycles # 2.940 GHz
+ 2,882,191,672 instructions # 1.39 insn per cycle
+ 0.760603829 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.585659e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.679951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.679951e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.571240e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.665961e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.665961e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.141107 sec
- 12,606,870,018 cycles # 3.041 GHz
- 34,392,677,682 instructions # 2.73 insn per cycle
- 4.146310630 seconds time elapsed
+TOTAL : 4.163303 sec
+ 12,605,463,394 cycles # 3.025 GHz
+ 34,393,608,512 instructions # 2.73 insn per cycle
+ 4.168641817 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.476247e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.957210e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.957210e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.401759e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.886488e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.886488e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.000613 sec
- 6,098,731,252 cycles # 3.041 GHz
- 14,873,462,613 instructions # 2.44 insn per cycle
- 2.006051106 seconds time elapsed
+TOTAL : 2.027469 sec
+ 6,100,742,722 cycles # 3.002 GHz
+ 14,874,619,740 instructions # 2.44 insn per cycle
+ 2.032997684 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.182448e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.992665e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.992665e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.152588e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.984648e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.984648e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.544245 sec
- 4,326,302,580 cycles # 2.793 GHz
- 9,041,454,033 instructions # 2.09 insn per cycle
- 1.549495391 seconds time elapsed
+TOTAL : 1.570348 sec
+ 4,280,521,919 cycles # 2.743 GHz
+ 9,042,316,644 instructions # 2.11 insn per cycle
+ 1.575934676 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.602793e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.504278e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.504278e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.548985e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.445828e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.445828e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.462983 sec
- 4,209,847,303 cycles # 2.868 GHz
- 8,675,528,842 instructions # 2.06 insn per cycle
- 1.468300337 seconds time elapsed
+TOTAL : 1.472831 sec
+ 4,206,089,473 cycles # 2.847 GHz
+ 8,677,889,358 instructions # 2.06 insn per cycle
+ 1.478375348 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.697162e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.177263e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.177263e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.660562e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.137441e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.137441e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.925379 sec
- 3,842,178,645 cycles # 1.991 GHz
- 7,819,452,293 instructions # 2.04 insn per cycle
- 1.930845155 seconds time elapsed
+TOTAL : 1.938115 sec
+ 3,846,715,012 cycles # 1.980 GHz
+ 7,820,097,651 instructions # 2.03 insn per cycle
+ 1.943482590 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index c32244c33c..a8a81cca05 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:26:34
+DATE: 2023-11-08_21:42:52

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.460575e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.684792e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.012555e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.468219e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.688670e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.018561e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478960 sec
- 2,073,686,428 cycles # 2.952 GHz
- 2,982,309,893 instructions # 1.44 insn per cycle
- 0.760465246 seconds time elapsed
+TOTAL : 0.479145 sec
+ 2,060,928,745 cycles # 2.937 GHz
+ 2,943,965,642 instructions # 1.43 insn per cycle
+ 0.760902085 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.768420e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.879887e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.879887e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.752408e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.860428e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.860428e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 3.874951 sec
- 11,759,850,982 cycles # 3.031 GHz
- 35,129,174,459 instructions # 2.99 insn per cycle
- 3.880406297 seconds time elapsed
+TOTAL : 3.895863 sec
+ 11,764,358,308 cycles # 3.017 GHz
+ 35,130,105,613 instructions # 2.99 insn per cycle
+ 3.901121829 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.548911e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.058975e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.058975e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.491671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.980976e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.980976e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.977553 sec
- 5,960,287,184 cycles # 3.008 GHz
- 14,484,169,544 instructions # 2.43 insn per cycle
- 1.983134337 seconds time elapsed
+TOTAL : 1.995272 sec
+ 5,963,721,442 cycles # 2.982 GHz
+ 14,483,479,258 instructions # 2.43 insn per cycle
+ 2.000909308 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.662372e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.600563e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.600563e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.606859e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.529662e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.529662e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.451994 sec
- 4,186,509,528 cycles # 2.874 GHz
- 8,887,826,504 instructions # 2.12 insn per cycle
- 1.457581768 seconds time elapsed
+TOTAL : 1.463863 sec
+ 4,171,268,875 cycles # 2.840 GHz
+ 8,887,248,415 instructions # 2.13 insn per cycle
+ 1.469508622 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3576) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.782199e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.721549e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.721549e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.334017e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.185528e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.185528e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.432127 sec
- 4,128,776,992 cycles # 2.874 GHz
- 8,424,271,434 instructions # 2.04 insn per cycle
- 1.437420732 seconds time elapsed
+TOTAL : 1.515911 sec
+ 4,141,896,373 cycles # 2.724 GHz
+ 8,425,434,947 instructions # 2.03 insn per cycle
+ 1.521361653 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.779314e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.273574e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.273574e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.735035e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.250427e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.250427e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.899022 sec
- 3,798,792,191 cycles # 1.996 GHz
- 7,712,429,012 instructions # 2.03 insn per cycle
- 1.904382082 seconds time elapsed
+TOTAL : 1.913707 sec
+ 3,815,274,575 cycles # 1.989 GHz
+ 7,713,047,642 instructions # 2.02 insn per cycle
+ 1.919181973 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 4284e04c80..1d637e1269 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:05:13
+DATE: 2023-11-08_21:20:08

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.262595e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.173145e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.266137e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.064819e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.168761e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.265943e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.516288 sec
- 2,170,206,194 cycles # 2.914 GHz
- 3,121,753,700 instructions # 1.44 insn per cycle
- 0.802206987 seconds time elapsed
+TOTAL : 0.516845 sec
+ 2,194,660,841 cycles # 2.941 GHz
+ 3,161,612,621 instructions # 1.44 insn per cycle
+ 0.804942538 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.129811e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.193121e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.193121e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.076007e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.135159e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.135159e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.027744 sec
- 15,293,663,581 cycles # 3.040 GHz
- 38,642,438,156 instructions # 2.53 insn per cycle
- 5.032856601 seconds time elapsed
+TOTAL : 5.157074 sec
+ 15,456,785,340 cycles # 2.995 GHz
+ 38,638,875,955 instructions # 2.50 insn per cycle
+ 5.162652658 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.666972e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.869148e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.869148e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.689929e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.902707e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.902707e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.964411 sec
- 8,933,093,188 cycles # 3.009 GHz
- 24,243,353,502 instructions # 2.71 insn per cycle
- 2.969821465 seconds time elapsed
+TOTAL : 2.947066 sec
+ 8,960,192,906 cycles # 3.035 GHz
+ 24,239,204,206 instructions # 2.71 insn per cycle
+ 2.952599117 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.660709e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.167400e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.167400e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.870612e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.391820e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.391820e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.961588 sec
- 5,410,079,541 cycles # 2.752 GHz
- 11,291,080,205 instructions # 2.09 insn per cycle
- 1.966921243 seconds time elapsed
+TOTAL : 1.891319 sec
+ 5,424,929,342 cycles # 2.862 GHz
+ 11,287,630,140 instructions # 2.08 insn per cycle
+ 1.896741262 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.588007e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.231756e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.231756e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.626799e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.289896e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.289896e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.695283 sec
- 4,860,759,917 cycles # 2.859 GHz
- 10,541,284,808 instructions # 2.17 insn per cycle
- 1.700590360 seconds time elapsed
+TOTAL : 1.686295 sec
+ 4,842,859,663 cycles # 2.864 GHz
+ 10,535,885,470 instructions # 2.18 insn per cycle
+ 1.691658185 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.107588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.350535e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.350535e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.120532e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.365927e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.365927e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.656629 sec
- 5,204,386,075 cycles # 1.956 GHz
- 7,617,502,706 instructions # 1.46 insn per cycle
- 2.661905103 seconds time elapsed
+TOTAL : 2.650947 sec
+ 5,210,620,634 cycles # 1.962 GHz
+ 7,614,639,902 instructions # 1.46 insn per cycle
+ 2.656437650 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 58d2d743b0..92e3c9f0b5 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2023-11-03_19:05:40
+DATE: 2023-11-08_21:20:35

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.265506e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.176728e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.270375e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.066522e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173508e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.273022e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.513169 sec
- 2,175,922,923 cycles # 2.936 GHz
- 3,154,957,492 instructions # 1.45 insn per cycle
- 0.799013980 seconds time elapsed
+TOTAL : 0.512769 sec
+ 2,197,876,209 cycles # 2.961 GHz
+ 3,170,940,757 instructions # 1.44 insn per cycle
+ 0.799563998 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.110999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.171227e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.171227e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.111886e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.172848e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.172848e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.072155 sec
- 15,377,556,110 cycles # 3.029 GHz
- 40,435,905,161 instructions # 2.63 insn per cycle
- 5.077406066 seconds time elapsed
+TOTAL : 5.069953 sec
+ 15,385,884,321 cycles # 3.032 GHz
+ 40,433,272,287 instructions # 2.63 insn per cycle
+ 5.075349465 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.761885e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.974310e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.974310e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.654822e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.859127e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.859127e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.891901 sec
- 8,516,736,770 cycles # 2.941 GHz
- 23,273,421,536 instructions # 2.73 insn per cycle
- 2.897134410 seconds time elapsed
+TOTAL : 2.975229 sec
+ 8,506,893,399 cycles # 2.855 GHz
+ 23,270,886,855 instructions # 2.74 insn per cycle
+ 2.980696937 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.041812e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.416387e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.416387e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.053911e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.431363e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.431363e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.184891 sec
- 6,239,964,038 cycles # 2.850 GHz
- 12,976,938,369 instructions # 2.08 insn per cycle
- 2.190210603 seconds time elapsed
+TOTAL : 2.179721 sec
+ 6,241,572,834 cycles # 2.857 GHz
+ 12,973,482,438 instructions # 2.08 insn per cycle
+ 2.185137091 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.262419e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.673980e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.673980e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.331614e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.744905e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.744905e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.097286 sec
- 5,931,604,060 cycles # 2.822 GHz
- 12,254,844,972 instructions # 2.07 insn per cycle
- 2.102596228 seconds time elapsed
+TOTAL : 2.072194 sec
+ 5,929,542,555 cycles # 2.855 GHz
+ 12,251,825,862 instructions # 2.07 insn per cycle
+ 2.077717224 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.636806e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.830983e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.830983e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.800727e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.013912e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.013912e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.989274 sec
- 5,599,763,733 cycles # 1.871 GHz
- 8,758,209,944 instructions # 1.56 insn per cycle
- 2.994808333 seconds time elapsed
+TOTAL : 2.863923 sec
+ 5,611,513,288 cycles # 1.956 GHz
+ 8,753,901,381 instructions # 1.56 insn per cycle
+ 2.869313331 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index c973ded005..87df63c965 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2023-11-03_19:06:08
+DATE: 2023-11-08_21:21:03

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.987778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059978e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.879738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041736e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055795e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462314 sec - 1,969,733,176 cycles # 2.915 GHz - 2,854,417,454 instructions # 1.45 insn per cycle - 0.732902295 seconds time elapsed +TOTAL : 0.461849 sec + 1,973,375,466 cycles # 2.915 GHz + 2,850,187,396 instructions # 1.44 insn per cycle + 0.733799311 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125374e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329149e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114902e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320626e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332328e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595579 sec - 2,446,683,532 cycles # 2.952 GHz - 3,726,903,800 instructions # 1.52 insn per cycle - 0.888429467 seconds time elapsed +TOTAL : 0.597626 sec + 2,460,714,562 cycles # 2.956 GHz + 3,716,258,767 instructions # 1.51 insn per cycle + 0.892242937 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.543975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556543e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.549613e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.549613e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.463148 sec - 19,697,684,289 cycles # 3.046 GHz - 59,611,728,869 instructions # 3.03 insn per cycle - 6.467313414 seconds time elapsed +TOTAL : 6.480284 sec + 19,731,245,814 cycles # 3.044 GHz + 59,610,628,892 instructions # 3.02 insn per cycle + 6.484553626 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.806236e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.850408e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.850408e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.819525e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.864015e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.864015e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.430883 sec - 10,361,092,942 cycles # 3.017 GHz - 30,679,655,225 instructions # 2.96 insn per cycle - 3.435128458 seconds time elapsed +TOTAL : 3.421528 sec + 10,361,656,121 cycles # 3.025 GHz + 30,678,833,436 instructions # 2.96 insn per cycle + 3.425797412 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.723128e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.902993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.902993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.328413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.498915e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.498915e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.707466 sec - 4,879,146,362 cycles # 2.851 GHz - 11,021,709,924 instructions # 2.26 insn per cycle - 1.711937944 seconds time elapsed +TOTAL : 1.779184 sec + 4,885,070,909 cycles # 2.740 GHz + 11,021,940,228 instructions # 2.26 insn per cycle + 1.783393950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.105516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.105516e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089421e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111598e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.533989 sec - 4,371,523,225 cycles # 2.843 GHz - 10,299,869,041 instructions # 2.36 insn per cycle - 1.538284203 seconds time elapsed +TOTAL : 1.526514 sec + 4,365,565,996 cycles # 2.854 GHz + 10,298,805,774 instructions # 2.36 insn per cycle + 1.530732946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.583252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.691167e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.691167e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.324075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.430754e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.430754e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.184881 sec - 4,101,268,943 cycles # 1.874 GHz - 5,846,549,953 instructions # 1.43 insn per cycle - 2.189162148 seconds time elapsed +TOTAL : 2.262206 sec + 4,104,673,936 cycles # 1.812 GHz + 5,846,278,322 instructions # 1.42 insn per cycle + 2.266456846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index cc88ce6db1..a8aafca020 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:12 +DATE: 2023-11-08_21:52:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.617150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.773641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.773641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.668584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.838174e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.838174e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.490872 sec - 2,070,161,118 cycles # 2.946 GHz - 3,152,579,676 instructions # 1.52 insn per cycle - 0.759960652 seconds time elapsed +TOTAL : 0.491390 sec + 2,056,116,630 cycles # 2.930 GHz + 3,087,605,373 instructions # 1.50 insn per cycle + 0.760599439 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.487518e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.487518e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.636054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.636054e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.832612 sec - 3,193,307,533 cycles # 2.947 GHz - 4,978,788,975 instructions # 1.56 insn per cycle - 1.143205796 seconds time elapsed +TOTAL : 0.817784 sec + 3,130,594,447 cycles # 2.944 GHz + 4,997,770,241 instructions # 1.60 insn per cycle + 1.126915791 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.529162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.546135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546135e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.507705 sec - 19,736,202,639 cycles # 3.031 GHz - 59,616,040,959 instructions # 3.02 insn per cycle - 6.512416242 seconds time elapsed +TOTAL : 6.496533 sec + 19,730,935,453 cycles # 3.036 GHz + 59,615,663,798 instructions # 3.02 insn per cycle + 6.500895427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.815393e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.861165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.861165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.824473e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.869855e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.869855e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.431600 sec - 10,398,990,181 cycles # 3.027 GHz - 30,726,516,620 instructions # 2.95 insn per cycle - 3.436080496 seconds time elapsed +TOTAL : 3.425054 sec + 10,403,336,159 cycles # 3.035 GHz + 30,728,089,368 instructions # 2.95 insn per cycle + 3.429466512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
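What these regenerated logs mostly change is the measured throughput, so the quantity worth eyeballing in each diff is the relative shift between the '-'/'+' EvtsPerSec pairs. A hypothetical helper (not part of the repository) that extracts it, shown on the Rmb+ME pair from the bridge block above:

import re

def evts_per_sec(line: str) -> float:
    # Lines look like: "EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1"
    m = re.search(r"=\s*\(\s*([0-9.eE+-]+)\s*\)\s*sec\^-1", line)
    if m is None:
        raise ValueError(f"not an EvtsPerSec line: {line!r}")
    return float(m.group(1))

old = evts_per_sec("-EvtsPerSec[Rmb+ME] (23) = ( 2.687018e+06 ) sec^-1")
new = evts_per_sec("+EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1")
print(f"{(new / old - 1) * 100:+.1f}%")  # -> +2.5%, i.e. run-to-run noise
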
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.253880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.426870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.426870e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.541398e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.724381e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.724381e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.802152 sec - 4,928,997,803 cycles # 2.730 GHz - 11,072,368,065 instructions # 2.25 insn per cycle - 1.806633331 seconds time elapsed +TOTAL : 1.747981 sec + 4,923,635,172 cycles # 2.811 GHz + 11,072,838,099 instructions # 2.25 insn per cycle + 1.752609449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076136e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.072827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095239e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.553423 sec - 4,411,400,335 cycles # 2.833 GHz - 10,349,798,385 instructions # 2.35 insn per cycle - 1.557941492 seconds time elapsed +TOTAL : 1.557290 sec + 4,408,906,008 cycles # 2.824 GHz + 10,349,337,234 instructions # 2.35 insn per cycle + 1.561766662 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.266833e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.375233e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.375233e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.462789e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.573036e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.573036e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.287929 sec - 4,148,582,308 cycles # 1.811 GHz - 5,885,924,420 instructions # 1.42 insn per cycle - 2.292472050 seconds time elapsed +TOTAL : 2.226828 sec + 4,140,433,235 cycles # 1.856 GHz + 5,883,947,133 instructions # 1.42 insn per cycle + 2.231231918 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) 
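The "=Symbols in CPPProcess.o=" lines tally the generated code's instructions by SIMD width, which is how one can see at a glance that e.g. a 512z build really emits zmm code. As a rough illustration only (this is not the repository's actual tooling, and real ISA classification is subtler than register width), such a tally could be approximated by disassembling the object file:

import subprocess
from collections import Counter

def simd_tally(objfile: str) -> Counter:
    # Disassemble and bucket instructions by the widest SIMD register used.
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    tally = Counter()
    for line in asm.splitlines():
        if "%zmm" in line:    # 512-bit registers -> "512z"-like bucket
            tally["512z"] += 1
        elif "%ymm" in line:  # 256-bit registers -> "avx2"/"512y"-like bucket
            tally["avx2"] += 1
        elif "%xmm" in line:  # 128-bit registers -> "sse4"-like bucket
            tally["sse4"] += 1
    return tally

print(simd_tally("CPPProcess.o"))  # path is illustrative
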
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 890a9e444f..2485d7fbb8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:06:37 +DATE: 2023-11-08_21:21:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.934806e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040123e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052620e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.914793e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057322e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460430 sec - 1,973,324,046 cycles # 2.928 GHz - 2,840,856,751 instructions # 1.44 insn per cycle - 0.731489352 seconds time elapsed +TOTAL : 0.462395 sec + 2,001,608,406 cycles # 2.941 GHz + 2,866,642,977 instructions # 1.43 insn per cycle + 0.738112039 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.120884e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312101e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.109030e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310930e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322842e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.593653 sec - 2,438,307,110 cycles # 2.956 GHz - 3,770,815,852 instructions # 1.55 insn per cycle - 0.884294118 seconds time elapsed +TOTAL : 0.592309 sec + 2,454,004,684 cycles # 2.967 GHz + 3,701,468,710 instructions # 1.51 insn per cycle + 0.885901852 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.568377e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.546247e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558939e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.401933 sec - 19,482,758,220 cycles # 3.042 GHz - 58,802,978,389 instructions # 3.02 insn per cycle - 6.406140471 seconds time elapsed +TOTAL : 6.457597 sec + 19,573,619,879 cycles # 3.030 GHz + 58,802,481,580 instructions # 3.00 insn per cycle + 6.461777687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.917983e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.963815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.963815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.793642e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.840400e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.840400e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.353380 sec - 10,239,214,469 cycles # 3.050 GHz - 30,351,045,797 instructions # 2.96 insn per cycle - 3.357673213 seconds time elapsed +TOTAL : 3.440445 sec + 10,252,301,234 cycles # 2.977 GHz + 30,351,085,669 instructions # 2.96 insn per cycle + 3.444877379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.402320e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570383e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570383e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.384802e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.551869e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.551869e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.764710 sec - 5,042,998,580 cycles # 2.852 GHz - 11,486,615,235 instructions # 2.28 insn per cycle - 1.768978894 seconds time elapsed +TOTAL : 1.768254 sec + 5,044,938,195 cycles # 2.848 GHz + 11,486,596,301 instructions # 2.28 insn per cycle + 1.772428896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.003860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.019018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.038703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.038703e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.654433 sec - 4,647,317,234 cycles # 2.803 GHz - 10,844,918,785 instructions # 2.33 insn per cycle - 1.658681615 seconds time elapsed +TOTAL : 1.630183 sec + 4,647,706,592 cycles # 2.845 GHz + 10,845,108,593 instructions # 2.33 insn per cycle + 1.634411362 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.419133e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.526721e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.526721e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.188773e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.290125e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.290125e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.233568 sec - 4,119,227,015 cycles # 1.842 GHz - 6,111,995,104 instructions # 1.48 insn per cycle - 2.238507475 seconds time elapsed +TOTAL : 2.304290 sec + 4,123,403,300 cycles # 1.794 GHz + 6,113,558,333 instructions # 1.48 insn per cycle + 2.308644720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 906002ccef..0b448796b2 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:06 +DATE: 2023-11-08_21:22:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.570718e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.423909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.567286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.376211e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.468457e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445719 sec - 1,977,839,409 cycles # 2.946 GHz - 2,766,831,818 instructions # 1.40 insn per cycle - 0.728762524 seconds time elapsed +TOTAL : 0.444374 sec + 1,959,403,583 cycles # 2.932 GHz + 2,755,627,615 instructions # 1.41 insn per cycle + 0.725331091 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.444258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.461256e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527187e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.353667e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.408300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.476909e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.490311 sec - 2,098,277,441 cycles # 2.940 GHz - 3,050,395,563 instructions # 1.45 insn per cycle - 0.771282830 seconds time elapsed +TOTAL : 0.490778 sec + 2,119,348,519 cycles # 2.946 GHz + 3,045,536,225 instructions # 1.44 insn per cycle + 0.776414109 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 32,139,063 cycles # 2.763 GHz - 49,369,582 instructions # 1.54 insn per cycle - 0.012019390 seconds time elapsed + 31,825,625 cycles # 2.791 GHz + 48,514,379 instructions # 1.52 insn per cycle + 0.011782396 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index afa8c22c25..2f35cf010a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:36:42 +DATE: 2023-11-08_21:53:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.139273e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.139273e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.915722e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.200179e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200179e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.458037 sec - 1,958,659,698 cycles # 2.936 GHz - 2,907,533,469 instructions # 1.48 insn per cycle - 0.726231579 seconds time elapsed +TOTAL : 0.459965 sec + 1,913,489,356 cycles # 2.854 GHz + 2,835,494,218 instructions # 1.48 insn per cycle + 0.728586503 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.639472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.576828e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.576828e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.767536e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.641642e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641642e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.638235 sec - 2,567,083,186 cycles # 2.951 GHz - 3,965,073,751 instructions # 1.54 insn per cycle - 0.927254467 seconds time elapsed +TOTAL : 0.634368 sec + 2,553,649,677 cycles # 2.951 GHz + 3,942,242,941 instructions # 1.54 insn per cycle + 0.922459199 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,9 +99,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 38,813,158 cycles # 2.791 GHz - 52,008,055 instructions # 1.34 insn per cycle - 0.014463641 seconds time elapsed + 38,286,300 cycles # 2.778 GHz + 51,959,635 instructions # 1.36 insn per cycle + 0.014194921 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index e0c37ae81b..e630fbc27d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:15 +DATE: 2023-11-08_21:22:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.552711e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.409477e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.560442e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377270e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470091e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443645 sec - 1,939,887,285 cycles # 2.958 GHz - 2,753,223,301 instructions # 1.42 insn per cycle - 0.713433638 seconds time elapsed +TOTAL : 0.443339 sec + 1,943,957,931 cycles # 2.944 GHz + 2,765,105,739 instructions # 1.42 insn per cycle + 0.717258208 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.420862e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.422248e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.487501e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.360432e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412708e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.481720e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.489840 sec - 2,095,642,051 cycles # 2.944 GHz - 3,058,032,700 instructions # 1.46 insn per cycle - 0.771189239 seconds time elapsed +TOTAL : 0.491895 sec + 2,104,648,838 cycles # 2.938 GHz + 3,025,148,863 instructions # 1.44 insn per cycle + 0.773979442 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,454,006 cycles # 2.782 GHz - 48,514,001 instructions # 1.54 insn per cycle - 0.011695448 seconds time elapsed + 31,662,761 cycles # 2.798 GHz + 47,511,797 instructions # 1.50 insn per cycle + 0.011712916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 9bd85e98d0..e83376e827 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:25 +DATE: 2023-11-08_21:22:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.981637e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050998e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064107e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.888685e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043488e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056349e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460239 sec - 1,991,164,692 cycles # 2.956 GHz - 2,861,513,835 instructions # 1.44 insn per cycle - 0.731121053 seconds time elapsed +TOTAL : 0.461575 sec + 1,992,206,499 cycles # 2.947 GHz + 2,868,298,614 instructions # 1.44 insn per cycle + 0.733257197 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.125939e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329956e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111138e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327177e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.595711 sec - 2,444,157,832 cycles # 2.957 GHz - 3,696,457,333 instructions # 1.51 insn per cycle - 0.888026518 seconds time elapsed +TOTAL : 0.598628 sec + 2,465,744,279 cycles # 2.958 GHz + 3,812,193,472 instructions # 1.55 insn per cycle + 0.893336251 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 35,021,922 cycles # 2.756 GHz - 50,809,631 instructions # 1.45 insn per cycle - 0.013111359 seconds time elapsed + 34,711,490 cycles # 2.787 GHz + 50,039,456 instructions # 1.44 insn per cycle + 0.012986618 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 659836495f..ab62773e76 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-03_19:07:34 +DATE: 2023-11-08_21:22:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.948465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054410e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.840662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.037949e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050999e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460434 sec - 1,981,925,545 cycles # 2.941 GHz - 2,855,578,890 instructions # 1.44 insn per cycle - 0.731466835 seconds time elapsed +TOTAL : 0.462948 sec + 1,939,550,045 cycles # 2.866 GHz + 2,822,181,727 instructions # 1.46 insn per cycle + 0.733825753 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.114794e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303596e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314294e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.102587e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.303113e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314475e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.592739 sec - 2,423,209,817 cycles # 2.940 GHz - 3,698,114,761 instructions # 1.53 insn per cycle - 0.885260737 seconds time elapsed +TOTAL : 0.591515 sec + 2,444,078,815 cycles # 2.952 GHz + 3,674,116,474 instructions # 1.50 insn per cycle + 0.887442466 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,542,827 cycles # 2.778 GHz - 50,097,141 instructions # 1.45 insn per cycle - 0.012808089 seconds time elapsed + 34,181,769 cycles # 2.772 GHz + 49,201,973 instructions # 1.44 insn per cycle + 0.012846211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a9f9e7f9b0..0e571e2957 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:07:44 +DATE: 2023-11-08_21:22:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.471280e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.535938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538049e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.521778 sec - 2,221,753,731 cycles # 2.953 GHz - 3,509,979,793 instructions # 1.58 insn per cycle - 0.811888374 seconds time elapsed +TOTAL : 0.522429 sec + 2,216,464,510 cycles # 2.948 GHz + 3,445,335,287 instructions # 1.55 insn per cycle + 0.813178007 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130694e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.124490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154204e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.024926 sec - 9,877,023,451 cycles # 3.016 GHz - 20,938,621,148 instructions # 2.12 insn per cycle - 3.332222792 seconds time elapsed +TOTAL : 3.028693 sec + 9,700,865,704 cycles # 2.960 GHz + 20,299,179,534 instructions # 2.09 insn per cycle + 3.337900982 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942881e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943811e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948157e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949119e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949119e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.450914 sec - 25,661,004,969 cycles # 3.035 GHz - 78,943,064,293 instructions # 3.08 insn per cycle - 8.455241133 seconds time elapsed +TOTAL : 8.428390 sec + 25,658,286,461 cycles # 3.043 GHz + 78,943,496,553 instructions # 3.08 insn per cycle + 8.432674701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.566286e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569647e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569647e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.638426e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.641828e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641828e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.607952 sec - 12,925,846,736 cycles # 2.803 GHz - 39,287,875,718 instructions # 3.04 insn per cycle - 4.612260028 seconds time elapsed +TOTAL : 4.516511 sec + 12,940,511,466 cycles # 2.863 GHz + 39,286,083,355 instructions # 3.04 insn per cycle + 4.520821646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.376392e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.393376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.393376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.063000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.079398e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.079398e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.967322 sec - 5,576,808,906 cycles # 2.829 GHz - 13,690,679,702 instructions # 2.45 insn per cycle - 1.971661788 seconds time elapsed +TOTAL : 2.043453 sec + 5,578,804,578 cycles # 2.725 GHz + 13,689,979,347 instructions # 2.45 insn per cycle + 2.047766279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.568825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.591271e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.591271e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.584845e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.608001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.608001e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.723570 sec - 4,897,962,779 cycles # 2.836 GHz - 12,345,795,320 instructions # 2.52 insn per cycle - 1.727906957 seconds time elapsed +TOTAL : 1.720447 sec + 4,895,207,627 cycles # 2.839 GHz + 12,344,429,833 instructions # 2.52 insn per cycle + 1.724685286 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.463403e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.476893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.476893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.405020e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.418567e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.418567e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.207008 sec - 4,113,706,051 cycles # 1.861 GHz - 6,338,446,257 instructions # 1.54 insn per cycle - 2.211395304 seconds time elapsed +TOTAL : 2.224337 sec + 4,116,450,066 cycles # 1.848 GHz + 6,337,280,624 instructions # 1.54 insn per cycle + 2.228619766 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 05b9b7b471..6cfffac867 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:37:26 +DATE: 2023-11-08_21:53:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.138586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.475297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.475297e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.140206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.481973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481973e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.514369 sec - 2,174,774,169 cycles # 2.935 GHz - 3,408,753,270 instructions # 1.57 insn per cycle - 0.802511668 seconds time elapsed +TOTAL : 0.512248 sec + 2,184,996,199 cycles # 2.952 GHz + 3,435,282,796 instructions # 1.57 insn per cycle + 0.800472589 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.635405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119639e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119639e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.623195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.099384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099384e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.311178 sec - 10,730,531,324 cycles # 2.994 GHz - 24,179,707,994 instructions # 2.25 insn per cycle - 3.640277810 seconds time elapsed +TOTAL : 3.306442 sec + 10,620,771,247 cycles # 2.970 GHz + 24,014,706,294 instructions # 2.26 insn per cycle + 3.633696672 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.906612e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907549e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907549e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.935055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935984e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.615680 sec - 25,666,310,685 cycles # 2.978 GHz - 78,949,148,944 instructions # 3.08 insn per cycle - 8.620265583 seconds time elapsed +TOTAL : 8.489050 sec + 25,665,712,522 cycles # 3.023 GHz + 78,953,227,075 instructions # 3.08 insn per cycle + 8.493532453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.685334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.688850e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.688850e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600578e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604115e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604115e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.463406 sec - 12,942,626,026 cycles # 2.897 GHz - 39,297,696,719 instructions # 3.04 insn per cycle - 4.468216686 seconds time elapsed +TOTAL : 4.569107 sec + 12,945,693,806 cycles # 2.831 GHz + 39,298,314,532 instructions # 3.04 insn per cycle + 4.573645709 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) 
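A quick way to compare the old (-) and new (+) throughput numbers in hunks like the ones above is to pair up the EvtsPerSec lines and print their relative change; the few-percent deltas seen here look like run-to-run noise rather than a real performance shift. A minimal Python sketch, assuming the patch text is saved locally as ggttgg.diff (the filename and script are illustrative, not part of the repository):

    #!/usr/bin/env python3
    # Pair '-EvtsPerSec...' and '+EvtsPerSec...' lines from a log diff
    # and print the relative change of each throughput counter.
    import re, sys
    pat = re.compile(r'^([-+])EvtsPerSec\[([^\]]+)\]\s+\((\w+)\)\s+=\s+\(\s*([0-9.eE+-]+)\s*\)')
    old, new = [], []
    for line in open(sys.argv[1]):
        m = pat.match(line)
        if m:
            sign, metric, tag, val = m.groups()
            (old if sign == '-' else new).append((metric, tag, float(val)))
    for (met, tag, o), (_, _, n) in zip(old, new):
        print(f'{met} ({tag}): {o:.3e} -> {n:.3e} sec^-1 ({100 * (n / o - 1):+.1f}%)')

Run for instance as "python3 compare.py ggttgg.diff" to list every counter's old value, new value, and percentage change.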
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.403877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.422097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.422097e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.385455e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.402719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.402719e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.965161 sec - 5,597,716,321 cycles # 2.843 GHz - 13,700,115,311 instructions # 2.45 insn per cycle - 1.969720229 seconds time elapsed +TOTAL : 1.969364 sec + 5,591,964,229 cycles # 2.834 GHz + 13,700,332,532 instructions # 2.45 insn per cycle + 1.973976640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573549e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596918e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596918e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.515181e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.538996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.538996e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726627 sec - 4,910,197,742 cycles # 2.838 GHz - 12,354,930,161 instructions # 2.52 insn per cycle - 1.731069519 seconds time elapsed +TOTAL : 1.736968 sec + 4,912,884,670 cycles # 2.825 GHz + 12,356,069,233 instructions # 2.52 insn per cycle + 1.741510676 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.408369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.421923e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.421923e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.401693e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.415615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.415615e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.227463 sec - 4,132,274,023 cycles # 1.852 GHz - 6,348,232,709 instructions # 1.54 insn per cycle - 2.231941444 seconds time elapsed +TOTAL : 2.229815 sec + 4,139,073,894 cycles # 1.853 GHz + 6,348,807,900 instructions # 1.53 insn per cycle + 2.234437952 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4a13c45dc..829db14182 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:49:13 +DATE: 2023-11-08_22:05:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.522013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498087e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526521e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.506263 sec - 2,193,209,541 cycles # 2.934 GHz - 3,448,112,270 instructions # 1.57 insn per cycle - 0.811794626 seconds time elapsed +TOTAL : 0.505963 sec + 2,230,602,584 cycles # 2.998 GHz + 3,509,146,743 instructions # 1.57 insn per cycle + 0.814638005 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176419e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.138629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.170285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.171692e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.133332 sec - 10,144,803,574 cycles # 2.992 GHz - 22,979,164,856 instructions # 2.27 insn per cycle - 3.446699997 seconds time elapsed +TOTAL : 3.117149 sec + 10,263,257,910 cycles # 3.044 GHz + 22,984,843,224 instructions # 2.24 insn per cycle + 3.428387488 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.934897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935823e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935823e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955282e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.956235e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.956235e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.487397 sec - 25,642,144,633 cycles # 3.020 GHz - 78,942,503,354 instructions # 3.08 insn per cycle - 8.491509185 seconds time elapsed +TOTAL : 8.398336 sec + 25,654,945,707 cycles # 3.057 GHz + 78,946,836,924 instructions # 3.08 insn per cycle + 8.402318295 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.604711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608085e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608085e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739022e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742322e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742322e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.560510 sec - 12,949,935,406 cycles # 2.841 GHz - 39,287,959,625 instructions # 3.03 insn per cycle - 4.564590789 seconds time elapsed +TOTAL : 4.397110 sec + 12,932,706,473 cycles # 2.939 GHz + 39,284,078,298 instructions # 3.04 insn per cycle + 4.401176578 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331820e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.349574e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.349574e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.547122e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.565515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.565515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.979581 sec - 5,585,242,942 cycles # 2.817 GHz - 13,688,645,923 instructions # 2.45 insn per cycle - 1.983846301 seconds time elapsed +TOTAL : 1.929747 sec + 5,584,587,761 cycles # 2.889 GHz + 13,688,784,163 instructions # 2.45 insn per cycle + 1.933938249 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.501909e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.523734e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.523734e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.712996e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.736353e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.736353e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.737131 sec - 4,904,473,574 cycles # 2.818 GHz - 12,343,066,066 instructions # 2.52 insn per cycle - 1.741373569 seconds time elapsed +TOTAL : 1.699524 sec + 4,899,825,358 cycles # 2.877 GHz + 12,342,496,756 instructions # 2.52 insn per cycle + 1.703963805 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.326865e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.339889e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.339889e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.584277e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.599062e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.599062e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.249568 sec - 4,122,823,033 cycles # 1.830 GHz - 6,335,244,526 instructions # 1.54 insn per cycle - 2.253741280 seconds time elapsed +TOTAL : 2.173644 sec + 4,127,419,767 cycles # 1.897 GHz + 6,336,272,499 instructions # 1.54 insn per cycle + 2.177878840 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
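The perf-style counter lines can be cross-checked against one another: the reported clock is roughly cycles divided by run time (only roughly, because perf normalizes cycles to task-clock rather than the wall-clock "seconds time elapsed"), while "insn per cycle" is exactly instructions divided by cycles. A short check using the 512z block just above:

    # Sanity-check of the derived counters in the 512z block above.
    cycles = 4_127_419_767
    instructions = 6_336_272_499
    elapsed = 2.177878840                                 # wall-clock seconds
    print(f'{cycles / elapsed / 1e9:.2f} GHz')            # ~1.90 (log reports 1.897)
    print(f'{instructions / cycles:.2f} insn per cycle')  # 1.54, as in the log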
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 8a019b9732..35703491ac 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:45:52 +DATE: 2023-11-08_22:02:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.497991e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.525524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527678e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.483209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.509549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.511610e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505150 sec - 2,198,803,568 cycles # 2.954 GHz - 3,469,496,289 instructions # 1.58 insn per cycle - 0.812740673 seconds time elapsed +TOTAL : 0.505331 sec + 2,237,004,452 cycles # 3.017 GHz + 3,469,560,739 instructions # 1.55 insn per cycle + 0.813831791 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.149366e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.183697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185208e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.169549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.170864e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.069067 sec - 9,961,450,693 cycles # 3.001 GHz - 22,775,488,914 instructions # 2.29 insn per cycle - 3.378594275 seconds time elapsed +TOTAL : 3.063844 sec + 10,025,654,279 cycles # 3.024 GHz + 22,437,691,349 instructions # 2.24 insn per cycle + 3.371428026 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.920062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.972408e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.973332e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973332e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.555088 sec - 25,630,164,257 cycles # 2.995 GHz - 78,942,698,347 instructions # 3.08 insn per cycle - 8.559388166 seconds time elapsed +TOTAL : 8.324018 sec + 25,644,049,472 cycles # 3.080 GHz + 78,945,889,994 instructions # 3.08 insn per cycle + 8.328093218 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.673575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677034e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.757960e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761409e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761409e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.474572 sec - 12,938,774,287 cycles # 2.890 GHz - 39,284,863,862 instructions # 3.04 insn per cycle - 4.478882140 seconds time elapsed +TOTAL : 4.373690 sec + 12,932,578,462 cycles # 2.955 GHz + 39,286,223,538 instructions # 3.04 insn per cycle + 4.377750469 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.365364e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382422e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382422e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504027e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.521553e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.521553e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.970072 sec - 5,585,160,191 cycles # 2.830 GHz - 13,689,327,859 instructions # 2.45 insn per cycle - 1.974279626 seconds time elapsed +TOTAL : 1.937880 sec + 5,579,002,067 cycles # 2.875 GHz + 13,689,941,055 instructions # 2.45 insn per cycle + 1.941926119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.573694e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.596726e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.596726e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.762551e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.785341e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785341e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.722482 sec - 4,895,075,879 cycles # 2.836 GHz - 12,344,411,096 instructions # 2.52 insn per cycle - 1.726704102 seconds time elapsed +TOTAL : 1.689011 sec + 4,900,729,891 cycles # 2.896 GHz + 12,344,260,353 instructions # 2.52 insn per cycle + 1.693208802 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.342892e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.356180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.356180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.678242e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.692622e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.692622e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.243467 sec - 4,145,301,834 cycles # 1.845 GHz - 6,337,134,423 instructions # 1.53 insn per cycle - 2.247770943 seconds time elapsed +TOTAL : 2.144938 sec + 4,120,050,897 cycles # 1.918 GHz + 6,337,719,473 instructions # 1.54 insn per cycle + 2.149063218 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
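The VECTOR[N] width in "Internal loops fptype_sv" follows directly from the SIMD register size: for these DOUBLE-precision runs, N is the register width in bits divided by 64, which is why sse4 (128-bit) runs with VECTOR[2], avx2 and 512y (256-bit) with VECTOR[4], and 512z (512-bit) with VECTOR[8]:

    # Doubles per SIMD register for each build tag in these logs.
    for tag, bits in [('sse4', 128), ('avx2', 256), ('512y', 256), ('512z', 512)]:
        print(f'{tag}: VECTOR[{bits // 64}]')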
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0761c0d014..e3bb9b2d2b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:42:34 +DATE: 2023-11-08_21:59:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.224877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.202444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.498519e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.509685 sec - 2,194,677,974 cycles # 2.952 GHz - 3,468,699,947 instructions # 1.58 insn per cycle - 0.805522488 seconds time elapsed +TOTAL : 0.507729 sec + 2,224,053,547 cycles # 2.995 GHz + 3,511,447,697 instructions # 1.58 insn per cycle + 0.804264263 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178277e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.754243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179050e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.196085 sec - 10,332,938,289 cycles # 2.993 GHz - 23,233,171,839 instructions # 2.25 insn per cycle - 3.511259911 seconds time elapsed +TOTAL : 3.195418 sec + 10,560,218,824 cycles # 3.053 GHz + 23,272,224,469 instructions # 2.20 insn per cycle + 3.516017944 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927835e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928807e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928807e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.980652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.981660e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981660e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.516575 sec - 25,626,746,874 cycles # 3.008 GHz - 78,942,783,638 instructions # 3.08 insn per cycle - 8.520860421 seconds time elapsed +TOTAL : 8.289291 sec + 25,689,530,854 cycles # 3.098 GHz + 78,941,485,494 instructions # 3.07 insn per cycle + 8.293329329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.674456e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677849e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677849e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695812e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.699396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699396e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.472834 sec - 12,938,647,402 cycles # 2.891 GHz - 39,285,558,550 instructions # 3.04 insn per cycle - 4.477166946 seconds time elapsed +TOTAL : 4.446640 sec + 12,939,707,143 cycles # 2.908 GHz + 39,286,790,527 instructions # 3.04 insn per cycle + 4.450934428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.290335e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307469e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307469e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.540564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.557570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.557570e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.987857 sec - 5,582,015,296 cycles # 2.804 GHz - 13,690,066,849 instructions # 2.45 insn per cycle - 1.992149312 seconds time elapsed +TOTAL : 1.929715 sec + 5,584,326,574 cycles # 2.891 GHz + 13,690,307,414 instructions # 2.45 insn per cycle + 1.933841922 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.537627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.561759e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.561759e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.772043e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.794673e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.794673e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.729438 sec - 4,899,116,746 cycles # 2.827 GHz - 12,344,356,410 instructions # 2.52 insn per cycle - 1.733854664 seconds time elapsed +TOTAL : 1.687536 sec + 4,894,600,072 cycles # 2.895 GHz + 12,345,111,795 instructions # 2.52 insn per cycle + 1.691722733 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.331605e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.345774e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.345774e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.667022e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.680748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.680748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.247519 sec - 4,126,377,191 cycles # 1.833 GHz - 6,337,288,668 instructions # 1.54 insn per cycle - 2.251874954 seconds time elapsed +TOTAL : 2.148381 sec + 4,119,534,680 cycles # 1.915 GHz + 6,337,066,991 instructions # 1.54 insn per cycle + 2.152520896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index d519ec18af..2d6466a5d0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:08:21 +DATE: 2023-11-08_21:23:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482135e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.511176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.472415e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497562e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499582e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519206 sec - 2,212,325,201 cycles # 2.954 GHz - 3,433,704,735 instructions # 1.55 insn per cycle - 0.807580904 seconds time elapsed +TOTAL : 0.522603 sec + 2,199,879,357 cycles # 2.926 GHz + 3,406,329,945 instructions # 1.55 insn per cycle + 0.812598895 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.159162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.186085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.187240e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.151978e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.180898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182114e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.004869 sec - 9,812,463,662 cycles # 3.013 GHz - 21,581,231,713 instructions # 2.20 insn per cycle - 3.312573877 seconds time elapsed +TOTAL : 3.012462 sec + 9,824,681,781 cycles # 3.013 GHz + 20,251,773,236 instructions # 2.06 insn per cycle + 3.320916673 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947345e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.948277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.948277e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948786e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949722e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949722e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.431137 sec - 25,590,035,480 cycles # 3.034 GHz - 78,715,048,416 instructions # 3.08 insn per cycle - 8.435307792 seconds time elapsed +TOTAL : 8.425425 sec + 25,600,858,897 cycles # 3.038 GHz + 78,714,675,174 instructions # 3.07 insn per cycle + 8.429623210 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.620452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623805e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623805e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.648721e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652034e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652034e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.539871 sec - 12,909,848,042 cycles # 2.843 GHz - 39,233,023,972 instructions # 3.04 insn per cycle - 4.544176080 seconds time elapsed +TOTAL : 4.503525 sec + 12,897,071,716 cycles # 2.862 GHz + 39,231,170,693 instructions # 3.04 insn per cycle + 4.507786711 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.331174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.348654e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.348654e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.358235e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.375211e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.375211e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.977747 sec - 5,618,064,764 cycles # 2.836 GHz - 13,804,762,963 instructions # 2.46 insn per cycle - 1.981982814 seconds time elapsed +TOTAL : 1.971459 sec + 5,607,121,481 cycles # 2.839 GHz + 13,803,544,350 instructions # 2.46 insn per cycle + 1.975775051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.463129e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.484771e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.484771e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.338508e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.360185e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.360185e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.742192 sec - 4,960,747,667 cycles # 2.842 GHz - 12,470,817,922 instructions # 2.51 insn per cycle - 1.746604551 seconds time elapsed +TOTAL : 1.768893 sec + 4,962,697,559 cycles # 2.805 GHz + 12,469,802,045 instructions # 2.51 insn per cycle + 1.786199910 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.427183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.440655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.440655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.426426e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.440315e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.440315e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.217977 sec - 4,119,292,054 cycles # 1.855 GHz - 6,462,314,928 instructions # 1.57 insn per cycle - 2.222289185 seconds time elapsed +TOTAL : 2.218010 sec + 4,123,694,980 cycles # 1.856 GHz + 6,461,412,200 instructions # 1.57 insn per cycle + 2.222394946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) 
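Dividing each vectorized throughput by the no-SIMD baseline of the same log makes the SIMD gain explicit; for example, with the new (+) MECalcOnly numbers of this inl0_hrd1 log:

    # SIMD speedups over the scalar 'none' build (new MECalcOnly values above).
    base = 1.949722e3                       # 'none'
    for tag, tput in [('sse4', 3.652034e3), ('avx2', 8.375211e3),
                      ('512y', 9.360185e3), ('512z', 7.440315e3)]:
        print(f'{tag}: x{tput / base:.1f}')  # ~1.9, 4.3, 4.8, 3.8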
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 0e734b6c9d..a4e352ee76 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:26:58 +DATE: 2023-11-08_21:43:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.264647e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.232524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.256814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.259170e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533653 sec - 2,219,666,724 cycles # 2.910 GHz - 3,445,153,040 instructions # 1.55 insn per cycle - 0.821091738 seconds time elapsed +TOTAL : 0.534744 sec + 2,248,485,126 cycles # 2.941 GHz + 3,494,101,027 instructions # 1.55 insn per cycle + 0.823969121 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.803191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.804422e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777627e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.804807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.805966e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.300230 sec - 10,634,484,052 cycles # 2.991 GHz - 23,844,861,281 instructions # 2.24 insn per cycle - 3.611693691 seconds time elapsed +TOTAL : 3.297104 sec + 10,673,501,263 cycles # 3.005 GHz + 24,226,094,920 instructions # 2.27 insn per cycle + 3.607615064 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.361422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.361903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.361903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.346513e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.346993e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.346993e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.613350 sec - 113,653,626,732 cycles # 3.022 GHz - 144,966,182,806 instructions # 1.28 insn per cycle - 37.617592948 seconds time elapsed +TOTAL : 37.741985 sec + 113,582,106,901 cycles # 3.009 GHz + 144,968,769,114 instructions # 1.28 insn per cycle + 37.746219696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.197160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.199710e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199710e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.143430e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.145919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.145919e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.138561 sec - 14,751,525,638 cycles # 2.870 GHz - 37,578,516,323 instructions # 2.55 insn per cycle - 5.143061031 seconds time elapsed +TOTAL : 5.226537 sec + 14,726,949,716 cycles # 2.816 GHz + 37,578,521,140 instructions # 2.55 insn per cycle + 5.230978594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.662015e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.676566e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.676566e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.619134e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.633428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.633428e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.150367 sec - 6,125,090,080 cycles # 2.844 GHz - 13,063,740,704 instructions # 2.13 insn per cycle - 2.154679772 seconds time elapsed +TOTAL : 2.162328 sec + 6,132,958,052 cycles # 2.832 GHz + 13,063,746,182 instructions # 2.13 insn per cycle + 2.166766443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.263953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.285040e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.285040e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.242664e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.263271e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.263271e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.780016 sec - 5,060,160,878 cycles # 2.837 GHz - 11,442,229,361 instructions # 2.26 insn per cycle - 1.784487029 seconds time elapsed +TOTAL : 1.783918 sec + 5,064,574,027 cycles # 2.835 GHz + 11,442,541,397 instructions # 2.26 insn per cycle + 1.788276031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.515689e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.530167e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.530167e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693472e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708550e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708550e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.192230 sec - 3,982,582,654 cycles # 1.814 GHz - 5,943,874,364 instructions # 1.49 insn per cycle - 2.196624515 seconds time elapsed +TOTAL : 2.141610 sec + 3,984,341,945 cycles # 1.859 GHz + 5,944,587,769 instructions # 1.49 insn per cycle + 2.145941939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index a431669edb..c9a3c0bc00 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-03_19:28:07 +DATE: 2023-11-08_21:44:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.227099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.252215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.254306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.238547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265593e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530677 sec - 2,254,800,400 cycles # 2.956 GHz - 3,541,881,168 instructions # 1.57 insn per cycle - 0.819833622 seconds time elapsed +TOTAL : 0.528864 sec + 2,246,214,726 cycles # 2.961 GHz + 3,512,868,349 instructions # 1.56 insn per cycle + 0.816400547 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821318e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.792504e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.819675e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.820783e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.276536 sec - 10,598,798,874 cycles # 3.001 GHz - 22,505,546,793 instructions # 2.12 insn per cycle - 3.590880872 seconds time elapsed +TOTAL : 3.270254 sec + 10,633,900,320 cycles # 3.014 GHz + 24,514,837,826 instructions # 2.31 insn per cycle + 3.584387558 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.316847e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.317310e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.317310e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.327617e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.328084e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.328084e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.002712 sec - 114,613,209,494 cycles # 3.016 GHz - 145,560,103,749 instructions # 1.27 insn per cycle - 38.007069023 seconds time elapsed +TOTAL : 37.906136 sec + 114,405,747,001 cycles # 3.018 GHz + 145,562,165,740 instructions # 1.27 insn per cycle + 37.910396057 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.101440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.103871e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.103871e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.123383e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.123383e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.297737 sec - 15,180,958,119 cycles # 2.864 GHz - 37,765,704,407 instructions # 2.49 insn per cycle - 5.302092232 seconds time elapsed +TOTAL : 5.264434 sec + 15,164,870,179 cycles # 2.879 GHz + 37,765,103,372 instructions # 2.49 insn per cycle + 5.268658441 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) 
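The "=Symbols in CPPProcess.o=" tallies count SIMD instructions of each class in the compiled object. A hedged sketch of how such counts could be approximated by hand, by register width in the disassembly (illustrative only; the tallies in these logs come from the repository's own scripts, which may classify instructions differently):

    # Rough per-width SIMD instruction tally for CPPProcess.o via objdump.
    import subprocess
    from collections import Counter
    asm = subprocess.run(['objdump', '-d', 'CPPProcess.o'],
                         capture_output=True, text=True).stdout
    widths = Counter()
    for line in asm.splitlines():
        for reg, label in (('%zmm', '512-bit'), ('%ymm', '256-bit'), ('%xmm', '128-bit')):
            if reg in line:
                widths[label] += 1
                break  # count each instruction once, widest register first
    print(dict(widths))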
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.750289e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.764988e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.764988e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.815263e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.829969e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.829969e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.125646 sec
- 6,006,519,083 cycles # 2.821 GHz
- 12,897,926,690 instructions # 2.15 insn per cycle
- 2.130039886 seconds time elapsed
+TOTAL : 2.107998 sec
+ 6,006,546,140 cycles # 2.845 GHz
+ 12,898,448,008 instructions # 2.15 insn per cycle
+ 2.112261899 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.134516e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.155464e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.155464e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.170106e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.191645e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.191645e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.805195 sec
- 5,111,264,978 cycles # 2.826 GHz
- 11,448,660,091 instructions # 2.24 insn per cycle
- 1.809562076 seconds time elapsed
+TOTAL : 1.798019 sec
+ 5,110,595,937 cycles # 2.837 GHz
+ 11,448,746,145 instructions # 2.24 insn per cycle
+ 1.802331588 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.713307e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.727980e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.727980e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.719086e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.733849e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.733849e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.136153 sec
- 3,956,606,945 cycles # 1.850 GHz
- 5,898,384,643 instructions # 1.49 insn per cycle
- 2.140545061 seconds time elapsed
+TOTAL : 2.134583 sec
+ 3,969,461,110 cycles # 1.857 GHz
+ 5,897,831,571 instructions # 1.49 insn per cycle
+ 2.138816528 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index 389fe370ef..9c1de01f16 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:08:57
+DATE: 2023-11-08_21:23:53
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.330449e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.375316e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.385679e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.293342e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.339166e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.344348e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.478801 sec
- 2,034,971,060 cycles # 2.940 GHz
- 3,054,212,240 instructions # 1.50 insn per cycle
- 0.749375620 seconds time elapsed
+TOTAL : 0.481289 sec
+ 2,043,429,418 cycles # 2.945 GHz
+ 3,016,391,404 instructions # 1.48 insn per cycle
+ 0.753087040 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.529589e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.587136e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.589764e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.613713e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.676727e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.679629e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.723184 sec
- 5,782,983,871 cycles # 2.964 GHz
- 12,066,403,823 instructions # 2.09 insn per cycle
- 2.008243733 seconds time elapsed
+TOTAL : 1.713007 sec
+ 5,846,211,987 cycles # 2.997 GHz
+ 12,059,135,892 instructions # 2.06 insn per cycle
+ 2.007812305 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.003677e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.004662e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.004662e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.005115e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.006106e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.006106e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.193664 sec
- 24,655,416,435 cycles # 3.008 GHz
- 78,134,412,275 instructions # 3.17 insn per cycle
- 8.197717930 seconds time elapsed
+TOTAL : 8.187511 sec
+ 24,627,671,323 cycles # 3.007 GHz
+ 78,134,663,224 instructions # 3.17 insn per cycle
+ 8.191568767 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.270897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.285143e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.285143e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.313136e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.326827e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.326827e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.263632 sec
- 6,475,526,341 cycles # 2.856 GHz
- 20,124,982,632 instructions # 3.11 insn per cycle
- 2.267936828 seconds time elapsed
+TOTAL : 2.250414 sec
+ 6,477,846,372 cycles # 2.874 GHz
+ 20,124,481,745 instructions # 3.11 insn per cycle
+ 2.254575609 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.655891e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.662862e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.662862e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.651750e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.658578e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.658578e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.998679 sec
- 2,840,454,971 cycles # 2.834 GHz
- 6,992,590,525 instructions # 2.46 insn per cycle
- 1.002898964 seconds time elapsed
+TOTAL : 1.000733 sec
+ 2,836,203,846 cycles # 2.824 GHz
+ 6,991,580,060 instructions # 2.47 insn per cycle
+ 1.005051926 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.904708e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.914180e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.914180e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.891596e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.900607e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900607e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.868982 sec
- 2,491,374,231 cycles # 2.855 GHz
- 6,299,681,276 instructions # 2.53 insn per cycle
- 0.873227215 seconds time elapsed
+TOTAL : 0.874979 sec
+ 2,489,876,695 cycles # 2.834 GHz
+ 6,298,919,091 instructions # 2.53 insn per cycle
+ 0.879145628 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.509691e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.515612e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.515612e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.492404e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.498044e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.498044e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.094413 sec
- 2,048,957,877 cycles # 1.866 GHz
- 3,269,073,408 instructions # 1.60 insn per cycle
- 1.098654820 seconds time elapsed
+TOTAL : 1.107211 sec
+ 2,056,905,721 cycles # 1.852 GHz
+ 3,268,863,177 instructions # 1.59 insn per cycle
+ 1.111361855 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
index 5a5ccf0962..7ef08eb1a1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:38:03
+DATE: 2023-11-08_21:54:33
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.621379e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.322960e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.322960e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.630785e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.310772e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.310772e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.467719 sec
- 2,022,012,389 cycles # 2.930 GHz
- 3,029,595,627 instructions # 1.50 insn per cycle
- 0.748028952 seconds time elapsed
+TOTAL : 0.466074 sec
+ 1,998,575,796 cycles # 2.933 GHz
+ 2,994,965,957 instructions # 1.50 insn per cycle
+ 0.738328183 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.232227e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.472561e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.472561e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.261662e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.481805e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.481805e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.900347 sec
- 6,375,786,665 cycles # 2.982 GHz
- 13,373,135,596 instructions # 2.10 insn per cycle
- 2.195039568 seconds time elapsed
+TOTAL : 1.889932 sec
+ 6,363,844,307 cycles # 2.984 GHz
+ 13,005,964,068 instructions # 2.04 insn per cycle
+ 2.191280597 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.008350e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.009347e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.009347e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.002381e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.003373e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.003373e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.176665 sec
- 24,649,325,474 cycles # 3.013 GHz
- 78,138,045,806 instructions # 3.17 insn per cycle
- 8.180908705 seconds time elapsed
+TOTAL : 8.200661 sec
+ 24,662,776,052 cycles # 3.006 GHz
+ 78,138,608,532 instructions # 3.17 insn per cycle
+ 8.204934256 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.326247e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.339746e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.339746e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.306848e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.320652e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.320652e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.249404 sec
- 6,483,421,678 cycles # 2.878 GHz
- 20,133,640,820 instructions # 3.11 insn per cycle
- 2.253658931 seconds time elapsed
+TOTAL : 2.255006 sec
+ 6,482,848,456 cycles # 2.870 GHz
+ 20,133,573,977 instructions # 3.11 insn per cycle
+ 2.259320427 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.657895e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.664866e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.664866e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.648854e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.655690e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.999874 sec
- 2,846,897,865 cycles # 2.837 GHz
- 7,001,448,108 instructions # 2.46 insn per cycle
- 1.004235579 seconds time elapsed
+TOTAL : 1.005313 sec
+ 2,849,286,060 cycles # 2.824 GHz
+ 7,001,856,779 instructions # 2.46 insn per cycle
+ 1.009712120 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.899947e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.909346e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.909346e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.888498e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.898036e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.898036e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.873710 sec
- 2,498,501,131 cycles # 2.848 GHz
- 6,308,536,459 instructions # 2.52 insn per cycle
- 0.877964105 seconds time elapsed
+TOTAL : 0.879137 sec
+ 2,499,075,063 cycles # 2.831 GHz
+ 6,309,019,763 instructions # 2.52 insn per cycle
+ 0.883537991 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.494285e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.499863e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.499863e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.493195e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.498802e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.498802e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.108704 sec
- 2,059,473,334 cycles # 1.852 GHz
- 3,279,338,884 instructions # 1.59 insn per cycle
- 1.113120539 seconds time elapsed
+TOTAL : 1.109448 sec
+ 2,060,050,205 cycles # 1.851 GHz
+ 3,279,571,633 instructions # 1.59 insn per cycle
+ 1.113744599 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index 12ad22d5a3..4d664fc4d6 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:49:50
+DATE: 2023-11-08_22:06:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.340393e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.392051e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397944e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.355118e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.403966e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.409182e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.462195 sec
- 1,986,930,742 cycles # 2.947 GHz
- 3,005,964,493 instructions # 1.51 insn per cycle
- 0.730831332 seconds time elapsed
+TOTAL : 0.462753 sec
+ 2,014,223,981 cycles # 2.997 GHz
+ 3,038,538,632 instructions # 1.51 insn per cycle
+ 0.729935758 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.547500e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.620827e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.624055e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.565713e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.634653e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.637720e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.798053 sec
- 6,062,916,752 cycles # 2.993 GHz
- 11,569,516,184 instructions # 1.91 insn per cycle
- 2.082278895 seconds time elapsed
+TOTAL : 1.796992 sec
+ 6,172,975,790 cycles # 3.046 GHz
+ 13,083,554,495 instructions # 2.12 insn per cycle
+ 2.086223865 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.005661e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.006690e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.006690e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.045430e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.046401e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.046401e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.186459 sec
- 24,671,953,454 cycles # 3.013 GHz
- 78,137,621,710 instructions # 3.17 insn per cycle
- 8.190517160 seconds time elapsed
+TOTAL : 8.027601 sec
+ 24,633,930,216 cycles # 3.068 GHz
+ 78,134,736,063 instructions # 3.17 insn per cycle
+ 8.031555788 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.107458e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.120841e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.120841e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.461058e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.474893e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.474893e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4
-TOTAL : 2.317106 sec
- 6,488,771,451 cycles # 2.796 GHz
- 20,124,539,496 instructions # 3.10 insn per cycle
- 2.321142527 seconds time elapsed
+TOTAL : 2.206755 sec
+ 6,481,821,994 cycles # 2.933 GHz
+ 20,123,351,594 instructions # 3.10 insn per cycle
+ 2.210721958 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.647793e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.654673e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.654673e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.665888e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.672800e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.672800e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 1.005506 sec
- 2,843,966,049 cycles # 2.818 GHz
- 6,991,496,346 instructions # 2.46 insn per cycle
- 1.009548479 seconds time elapsed
+TOTAL : 0.994258 sec
+ 2,841,630,041 cycles # 2.848 GHz
+ 6,990,811,149 instructions # 2.46 insn per cycle
+ 0.998209890 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.895349e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.904605e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.904605e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.891296e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.900721e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900721e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.875040 sec
- 2,495,845,822 cycles # 2.841 GHz
- 6,297,369,404 instructions # 2.52 insn per cycle
- 0.879134455 seconds time elapsed
+TOTAL : 0.876906 sec
+ 2,495,700,726 cycles # 2.835 GHz
+ 6,297,076,618 instructions # 2.52 insn per cycle
+ 0.880948978 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.504042e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.510113e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.510113e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.552086e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.558027e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.558027e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.099941 sec
- 2,050,409,457 cycles # 1.858 GHz
- 3,265,015,309 instructions # 1.59 insn per cycle
- 1.104007255 seconds time elapsed
+TOTAL : 1.065622 sec
+ 2,049,379,894 cycles # 1.917 GHz
+ 3,265,032,857 instructions # 1.59 insn per cycle
+ 1.069477010 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index 5b13ff9774..ee315233c1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:46:29
+DATE: 2023-11-08_22:02:50
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.339869e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.391844e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397472e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.328542e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.377951e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.383103e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.461224 sec
- 1,973,907,563 cycles # 2.940 GHz
- 2,969,869,707 instructions # 1.50 insn per cycle
- 0.729741448 seconds time elapsed
+TOTAL : 0.460444 sec
+ 2,025,388,470 cycles # 3.016 GHz
+ 3,026,490,924 instructions # 1.49 insn per cycle
+ 0.728886791 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.563612e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.637504e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.640751e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.561347e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.630349e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.633332e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.749654 sec
- 5,928,680,592 cycles # 2.999 GHz
- 12,893,930,524 instructions # 2.17 insn per cycle
- 2.033490620 seconds time elapsed
+TOTAL : 1.742876 sec
+ 6,025,656,195 cycles # 3.063 GHz
+ 13,153,972,386 instructions # 2.18 insn per cycle
+ 2.023922647 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.014770e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.015759e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.015759e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.049881e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.050905e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.050905e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.147957 sec
- 24,635,567,678 cycles # 3.022 GHz
- 78,133,891,626 instructions # 3.17 insn per cycle
- 8.152140443 seconds time elapsed
+TOTAL : 8.008804 sec
+ 24,622,379,845 cycles # 3.073 GHz
+ 78,134,077,156 instructions # 3.17 insn per cycle
+ 8.012721206 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.062428e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.074909e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.074909e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.445321e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.458917e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.458917e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.330015 sec
- 6,475,827,642 cycles # 2.775 GHz
- 20,124,634,132 instructions # 3.11 insn per cycle
- 2.334037311 seconds time elapsed
+TOTAL : 2.210782 sec
+ 6,475,852,782 cycles # 2.925 GHz
+ 20,124,175,553 instructions # 3.11 insn per cycle
+ 2.214842110 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.595519e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.602006e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.602006e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.697514e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.704851e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.704851e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.036160 sec
- 2,838,919,957 cycles # 2.730 GHz
- 6,991,694,320 instructions # 2.46 insn per cycle
- 1.040335460 seconds time elapsed
+TOTAL : 0.973836 sec
+ 2,835,149,001 cycles # 2.901 GHz
+ 6,991,410,852 instructions # 2.47 insn per cycle
+ 0.977864307 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.893954e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.903085e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.903085e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.934817e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.944385e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.944385e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.873924 sec
- 2,489,283,092 cycles # 2.837 GHz
- 6,298,948,511 instructions # 2.53 insn per cycle
- 0.878050091 seconds time elapsed
+TOTAL : 0.855140 sec
+ 2,487,419,693 cycles # 2.897 GHz
+ 6,298,706,089 instructions # 2.53 insn per cycle
+ 0.859052723 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.497242e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.502884e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.502884e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.555511e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.561377e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.561377e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.103482 sec
- 2,049,248,209 cycles # 1.852 GHz
- 3,268,952,113 instructions # 1.60 insn per cycle
- 1.107551558 seconds time elapsed
+TOTAL : 1.062258 sec
+ 2,048,558,209 cycles # 1.923 GHz
+ 3,268,764,234 instructions # 1.60 insn per cycle
+ 1.066272803 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index cdb252ac3a..efdbcfe1ae 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:43:11
+DATE: 2023-11-08_21:59:38
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.764175e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.406414e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.411755e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.758974e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.368878e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.373916e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.466281 sec
- 1,989,064,547 cycles # 2.930 GHz
- 3,017,212,928 instructions # 1.52 insn per cycle
- 0.737783039 seconds time elapsed
+TOTAL : 0.462598 sec
+ 2,002,083,704 cycles # 2.975 GHz
+ 3,028,559,110 instructions # 1.51 insn per cycle
+ 0.730010364 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -71,14 +71,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.472408e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.626435e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.629621e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.506677e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.634226e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.637242e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.825252 sec
- 6,129,357,136 cycles # 2.985 GHz
- 13,024,512,874 instructions # 2.12 insn per cycle
- 2.110041533 seconds time elapsed
+TOTAL : 1.818599 sec
+ 6,254,092,117 cycles # 3.058 GHz
+ 12,631,559,563 instructions # 2.02 insn per cycle
+ 2.110653596 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.017146e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.018188e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.018188e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.065897e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.066912e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.066912e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.138680 sec
- 24,636,857,889 cycles # 3.027 GHz
- 78,136,646,989 instructions # 3.17 insn per cycle
- 8.142807331 seconds time elapsed
+TOTAL : 7.946634 sec
+ 24,618,185,681 cycles # 3.097 GHz
+ 78,133,594,453 instructions # 3.17 insn per cycle
+ 7.950536612 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.266088e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.280126e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.280126e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.469422e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.483642e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.483642e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.265043 sec
- 6,477,387,096 cycles # 2.855 GHz
- 20,124,193,083 instructions # 3.11 insn per cycle
- 2.269259910 seconds time elapsed
+TOTAL : 2.203521 sec
+ 6,477,304,059 cycles # 2.935 GHz
+ 20,124,231,259 instructions # 3.11 insn per cycle
+ 2.207560981 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.644884e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.651718e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.651718e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.692268e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.699231e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.699231e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.005131 sec
- 2,839,448,871 cycles # 2.816 GHz
- 6,991,884,623 instructions # 2.46 insn per cycle
- 1.009345557 seconds time elapsed
+TOTAL : 0.976738 sec
+ 2,836,504,426 cycles # 2.894 GHz
+ 6,991,415,909 instructions # 2.46 insn per cycle
+ 0.980720950 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.866159e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.874920e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.874920e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.804638e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.812886e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.812886e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.887083 sec
- 2,489,977,422 cycles # 2.796 GHz
- 6,298,695,060 instructions # 2.53 insn per cycle
- 0.891225776 seconds time elapsed
+TOTAL : 0.917702 sec
+ 2,493,684,467 cycles # 2.707 GHz
+ 6,299,926,195 instructions # 2.53 insn per cycle
+ 0.922017124 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.498745e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.504407e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.504407e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.542695e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.548647e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.548647e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.102380 sec
- 2,046,697,565 cycles # 1.851 GHz
- 3,268,682,926 instructions # 1.60 insn per cycle
- 1.106464577 seconds time elapsed
+TOTAL : 1.070949 sec
+ 2,049,167,689 cycles # 1.907 GHz
+ 3,268,610,487 instructions # 1.60 insn per cycle
+ 1.074921168 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index 9fe77f3bb4..afc8dc6250 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-03_19:09:27
+DATE: 2023-11-08_21:24:23
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.327293e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.373619e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.378917e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.334864e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.384627e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.390200e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.480093 sec
- 2,046,992,398 cycles # 2.957 GHz
- 3,008,261,627 instructions # 1.47 insn per cycle
- 0.750577809 seconds time elapsed
+TOTAL : 0.478485 sec
+ 2,037,012,252 cycles # 2.938 GHz
+ 3,030,553,414 instructions # 1.49 insn per cycle
+ 0.751162438 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.515177e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.572348e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.574911e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.576633e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.638822e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.641657e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.715757 sec
- 5,871,373,370 cycles # 3.006 GHz
- 12,204,738,560 instructions # 2.08 insn per cycle
- 2.009775672 seconds time elapsed
+TOTAL : 1.723660 sec
+ 5,841,021,027 cycles # 2.992 GHz
+ 11,140,232,262 instructions # 1.91 insn per cycle
+ 2.010396879 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.026797e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.027818e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.027818e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.020250e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.021294e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.021294e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.100053 sec
- 24,563,227,881 cycles # 3.031 GHz
- 77,860,200,084 instructions # 3.17 insn per cycle
- 8.104232064 seconds time elapsed
+TOTAL : 8.126388 sec
+ 24,531,986,763 cycles # 3.018 GHz
+ 77,860,700,825 instructions # 3.17 insn per cycle
+ 8.130365170 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3113) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.430084e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.444359e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.444359e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.508420e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.523945e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.523945e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.215968 sec
- 6,421,588,621 cycles # 2.894 GHz
- 20,090,220,099 instructions # 3.13 insn per cycle
- 2.220335001 seconds time elapsed
+TOTAL : 2.192196 sec
+ 6,417,749,314 cycles # 2.923 GHz
+ 20,089,444,717 instructions # 3.13 insn per cycle
+ 2.196603069 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.625861e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632520e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632520e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.619246e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.625936e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.625936e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.016598 sec
- 2,906,571,537 cycles # 2.849 GHz
- 7,134,546,428 instructions # 2.45 insn per cycle
- 1.020819368 seconds time elapsed
+TOTAL : 1.020667 sec
+ 2,904,857,639 cycles # 2.836 GHz
+ 7,133,491,112 instructions # 2.46 insn per cycle
+ 1.024733034 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.810175e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.818358e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.818358e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.807219e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.815471e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.815471e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.914087 sec
- 2,595,791,217 cycles # 2.828 GHz
- 6,442,852,611 instructions # 2.48 insn per cycle
- 0.918452804 seconds time elapsed
+TOTAL : 0.915311 sec
+ 2,597,440,177 cycles # 2.827 GHz
+ 6,442,073,160 instructions # 2.48 insn per cycle
+ 0.919440444 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.453251e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.458727e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.458727e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.330502e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.335014e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.335014e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.136514 sec
- 2,124,554,510 cycles # 1.864 GHz
- 3,431,456,558 instructions # 1.62 insn per cycle
- 1.140688320 seconds time elapsed
+TOTAL : 1.241025 sec
+ 2,122,770,451 cycles # 1.706 GHz
+ 3,430,866,539 instructions # 1.62 insn per cycle
+ 1.245371552 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2912) (512y: 22) (512z: 9647)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 6d22eac4d2..86542f0b70 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:29:17
+DATE: 2023-11-08_21:45:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.584275e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.627587e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.631963e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.570490e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.610069e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.614296e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.489018 sec
- 2,081,067,674 cycles # 2.934 GHz
- 3,133,776,802 instructions # 1.51 insn per cycle
- 0.771988427 seconds time elapsed
+TOTAL : 0.491125 sec
+ 2,098,886,797 cycles # 2.948 GHz
+ 3,121,764,784 instructions # 1.49 insn per cycle
+ 0.773983413 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.747350e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.808169e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.810857e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.716470e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.775515e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.778049e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.853996 sec
- 6,275,481,753 cycles # 3.001 GHz
- 12,514,155,894 instructions # 1.99 insn per cycle
- 2.147936222 seconds time elapsed
+TOTAL : 1.856510 sec
+ 6,241,842,396 cycles # 2.982 GHz
+ 13,362,161,836 instructions # 2.14 insn per cycle
+ 2.150637345 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.644036e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.644860e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.644860e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.736455e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.737287e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.737287e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.065327 sec
- 87,424,924,787 cycles # 3.008 GHz
- 135,567,300,472 instructions # 1.55 insn per cycle
- 29.069446346 seconds time elapsed
+TOTAL : 28.600113 sec
+ 86,425,718,035 cycles # 3.022 GHz
+ 135,574,556,258 instructions # 1.57 insn per cycle
+ 28.604413837 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15486) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.026233e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.038857e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.038857e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.030289e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.043211e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.043211e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.342565 sec
- 6,786,587,363 cycles # 2.893 GHz
- 19,387,387,931 instructions # 2.86 insn per cycle
- 2.346831164 seconds time elapsed
+TOTAL : 2.341197 sec
+ 6,779,953,097 cycles # 2.892 GHz
+ 19,387,529,866 instructions # 2.86 insn per cycle
+ 2.345543121 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.459444e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.464900e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.464900e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.479111e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.484786e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.484786e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.132478 sec
- 3,179,013,562 cycles # 2.798 GHz
- 6,809,043,401 instructions # 2.14 insn per cycle
- 1.136902959 seconds time elapsed
+TOTAL : 1.117197 sec
+ 3,179,595,887 cycles # 2.837 GHz
+ 6,808,760,792 instructions # 2.14 insn per cycle
+ 1.121370768 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.738168e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.745907e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.745907e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.783416e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.791440e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.791440e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.952016 sec
- 2,651,392,730 cycles # 2.774 GHz
- 5,987,188,755 instructions # 2.26 insn per cycle
- 0.956397839 seconds time elapsed
+TOTAL : 0.927417 sec
+ 2,649,120,857 cycles # 2.846 GHz
+ 5,987,099,017 instructions # 2.26 insn per cycle
+ 0.931540821 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.472802e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.478184e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.478184e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.490502e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.495988e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.495988e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.121995 sec
- 2,073,738,270 cycles # 1.843 GHz
- 3,501,511,021 instructions # 1.69 insn per cycle
- 1.126283052 seconds time elapsed
+TOTAL : 1.108557 sec
+ 2,075,562,698 cycles # 1.867 GHz
+ 3,501,563,321 instructions # 1.69 insn per cycle
+ 1.112823809 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 5c9ad24a46..4737cdf8e3 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:30:09
+DATE: 2023-11-08_21:46:27
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.558233e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.598421e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.603327e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.528505e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.572699e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.577185e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.487345 sec
- 2,076,063,570 cycles # 2.928 GHz
- 3,124,474,063 instructions # 1.50 insn per cycle
- 0.769324674 seconds time elapsed
+TOTAL : 0.485528 sec
+ 2,086,161,680 cycles # 2.950 GHz
+ 3,149,356,396 instructions # 1.51 insn per cycle
+ 0.766853446 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.647182e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.706650e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.709351e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.640879e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.699452e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.702171e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.865548 sec
- 6,314,327,402 cycles # 2.992 GHz
- 13,540,816,282 instructions # 2.14 insn per cycle
- 2.170188129 seconds time elapsed
+TOTAL : 1.863196 sec
+ 6,301,645,470 cycles # 3.002 GHz
+ 12,163,417,933 instructions # 1.93 insn per cycle
+ 2.157068829 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.736423e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.737265e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737265e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.763152e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.763992e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.763992e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.597065 sec
- 86,035,998,776 cycles # 3.009 GHz
- 135,911,265,736 instructions # 1.58 insn per cycle
- 28.601145029 seconds time elapsed
+TOTAL : 28.466782 sec
+ 86,160,161,464 cycles # 3.027 GHz
+ 135,907,402,983 instructions # 1.58 insn per cycle
+ 28.470931551 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.976771e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.989628e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.989628e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.954712e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.967174e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.967174e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.358802 sec
- 6,848,676,061 cycles # 2.899 GHz
- 19,439,456,701 instructions # 2.84 insn per cycle
- 2.362995374 seconds time elapsed
+TOTAL : 2.366132 sec
+ 6,848,483,827 cycles # 2.890 GHz
+ 19,440,750,063 instructions # 2.84 insn per cycle
+ 2.370332980 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.510619e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.516450e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.516450e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.511072e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.516863e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.516863e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.093889 sec
- 3,110,977,160 cycles # 2.835 GHz
- 6,719,869,092 instructions # 2.16 insn per cycle
- 1.098127483 seconds time elapsed
+TOTAL : 1.093285 sec
+ 3,106,954,835 cycles # 2.833 GHz
+ 6,720,019,206 instructions # 2.16 insn per cycle
+ 1.097556495 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.794946e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.802956e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.802956e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.791720e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.799978e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.799978e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.922821 sec
- 2,627,235,427 cycles # 2.838 GHz
- 5,970,250,488 instructions # 2.27 insn per cycle
- 0.926978795 seconds time elapsed
+TOTAL : 0.924560 sec
+ 2,625,881,689 cycles # 2.831 GHz
+ 5,970,468,600 instructions # 2.27 insn per cycle
+ 0.928699193 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.483560e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.489106e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.489106e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.485772e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.491338e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.491338e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.114177 sec
- 2,080,137,201 cycles # 1.861 GHz
- 3,494,948,543 instructions # 1.68 insn per cycle
- 1.118521627 seconds time elapsed
+TOTAL : 1.112143 sec
+ 2,079,682,688 cycles # 1.864 GHz
+ 3,494,926,799 instructions # 1.68 insn per cycle
+ 1.116310984 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index b38c13fcd9..0d88057431 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:09:56
+DATE: 2023-11-08_21:24:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.468828e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.491770e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.493892e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.461953e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.486921e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.488984e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.519876 sec
- 2,215,127,737 cycles # 2.957 GHz
- 3,487,212,374 instructions # 1.57 insn per cycle
- 0.807913712 seconds time elapsed
+TOTAL : 0.524274 sec
+ 2,213,988,684 cycles # 2.939 GHz
+ 3,460,274,141 instructions # 1.56 insn per cycle
+ 0.814878779 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.135164e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.161799e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.162966e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.131317e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.159899e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.161114e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.028303 sec
- 9,769,796,186 cycles # 2.979 GHz
- 22,335,132,843 instructions # 2.29 insn per cycle
- 3.336784998 seconds time elapsed
+TOTAL : 3.024560 sec
+ 9,783,019,983 cycles # 2.986 GHz
+ 21,052,355,005 instructions # 2.15 insn per cycle
+ 3.333798384 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.912244e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.913140e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.913140e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.908400e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.909295e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.909295e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.586099 sec
- 25,914,180,302 cycles # 3.017 GHz
- 79,445,505,152 instructions # 3.07 insn per cycle
- 8.590406292 seconds time elapsed
+TOTAL : 8.603127 sec
+ 25,922,951,314 cycles # 3.012 GHz
+ 79,444,287,848 instructions # 3.06 insn per cycle
+ 8.607377110 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4857) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.695684e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.699049e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.699049e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.601676e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.605199e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.605199e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.447189 sec
- 12,656,450,439 cycles # 2.844 GHz
- 38,554,825,829 instructions # 3.05 insn per cycle
- 4.451478069 seconds time elapsed
+TOTAL : 4.563626 sec
+ 12,670,494,381 cycles # 2.774 GHz
+ 38,555,115,428 instructions # 3.04 insn per cycle
+ 4.567958025 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.537952e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.556620e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.556620e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.436133e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.453065e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.453065e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.930375 sec
- 5,512,214,802 cycles # 2.850 GHz
- 13,486,265,307 instructions # 2.45 insn per cycle
- 1.934770358 seconds time elapsed
+TOTAL : 1.953575 sec
+ 5,515,640,809 cycles # 2.818 GHz
+ 13,484,131,277 instructions # 2.44 insn per cycle
+ 1.957940467 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.638550e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.660856e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.660856e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.530089e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.553433e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.553433e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.711054 sec
- 4,872,445,248 cycles # 2.842 GHz
- 12,141,983,198 instructions # 2.49 insn per cycle
- 1.715434660 seconds time elapsed
+TOTAL : 1.730211 sec
+ 4,882,100,767 cycles # 2.816 GHz
+ 12,140,913,078 instructions # 2.49 insn per cycle
+ 1.734496344 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.406789e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.420159e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.420159e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.332978e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.346275e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.346275e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.223975 sec
- 4,144,217,356 cycles # 1.862 GHz
- 6,340,578,545 instructions # 1.53 insn per cycle
- 2.228285470 seconds time elapsed
+TOTAL : 2.246181 sec
+ 4,144,338,295 cycles # 1.842 GHz
+ 6,339,235,304 instructions # 1.53 insn per cycle
+ 2.250536993 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1802) (512y: 93) (512z: 9358)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
index 46f37c0a90..154c33870f 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2023-11-03_19:10:33
+DATE: 2023-11-08_21:25:29
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.484364e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.507714e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.509764e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.466139e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.491413e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.493568e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.519968 sec
- 2,216,873,411 cycles # 2.952 GHz
- 3,459,675,597 instructions # 1.56 insn per cycle
- 0.809738739 seconds time elapsed
+TOTAL : 0.523033 sec
+ 2,231,792,351 cycles # 2.947 GHz
+ 3,493,743,246 instructions # 1.57 insn per cycle
+ 0.817222718 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.134555e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.161246e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.162402e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.134865e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.163582e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.164827e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.016486 sec
- 9,822,814,204 cycles # 3.004 GHz
- 22,339,986,571 instructions # 2.27 insn per cycle
- 3.325238208 seconds time elapsed
+TOTAL : 3.022684 sec
+ 9,525,982,822 cycles # 2.907 GHz
+ 21,759,904,749 instructions # 2.28 insn per cycle
+ 3.333718015 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.909809e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.910727e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.910727e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.890125e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.891036e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.891036e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.597381 sec
- 25,939,435,501 cycles # 3.017 GHz
- 79,457,351,519 instructions # 3.06 insn per cycle
- 8.601657625 seconds time elapsed
+TOTAL : 8.687196 sec
+ 25,936,497,205 cycles # 2.985 GHz
+ 79,455,431,598 instructions # 3.06 insn per cycle
+ 8.691442955 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4504) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.664829e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.668218e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.668218e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.674580e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.678053e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.678053e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.484461 sec
- 12,651,418,370 cycles # 2.819 GHz
- 38,525,727,884 instructions # 3.05 insn per cycle
- 4.488762135 seconds time elapsed
+TOTAL : 4.473580 sec
+ 12,663,684,829 cycles # 2.829 GHz
+ 38,526,072,859 instructions # 3.04 insn per cycle
+ 4.477928329 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12928) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.385701e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.404187e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.404187e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.447225e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.464376e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.464376e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.965077 sec
- 5,557,225,506 cycles # 2.823 GHz
- 13,610,780,927 instructions # 2.45 insn per cycle
- 1.969439061 seconds time elapsed
+TOTAL : 1.950551 sec
+ 5,554,043,311 cycles # 2.842 GHz
+ 13,609,444,575 instructions # 2.45 insn per cycle
+ 1.954818500 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.328216e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.349743e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.349743e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.528912e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.551046e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.551046e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.767465 sec
- 4,920,931,185 cycles # 2.779 GHz
- 12,278,542,674 instructions # 2.50 insn per cycle
- 1.771926617 seconds time elapsed
+TOTAL : 1.730043 sec
+ 4,918,299,350 cycles # 2.837 GHz
+ 12,276,281,852 instructions # 2.50 insn per cycle
+ 1.734286887 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.389874e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.403004e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.403004e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.227160e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.239598e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.239598e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.228912 sec
- 4,146,930,402 cycles # 1.858 GHz
- 6,446,453,346 instructions # 1.55 insn per cycle
- 2.233245374 seconds time elapsed
+TOTAL : 2.278650 sec
+ 4,148,690,065 cycles # 1.818 GHz
+ 6,446,007,726 instructions # 1.55 insn per cycle
+ 2.282996103 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1627) (512y: 191) (512z: 9356)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index 2048a9698e..f7c4424904 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:12:52
+DATE: 2023-11-08_21:27:51
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.071850e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.072225e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.072335e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.070515e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.070905e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.071008e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.421447 sec
- 8,245,731,454 cycles # 3.012 GHz
- 18,688,279,165 instructions # 2.27 insn per cycle
- 2.797097094 seconds time elapsed
+TOTAL : 2.420963 sec
+ 8,223,258,722 cycles # 3.000 GHz
+ 17,670,197,130 instructions # 2.15 insn per cycle
+ 2.797812392 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.261920e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.263777e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.264034e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.267469e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.269461e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.269740e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.993277 sec
- 12,924,149,664 cycles # 2.993 GHz
- 29,920,520,122 instructions # 2.32 insn per cycle
- 4.373104302 seconds time elapsed
+TOTAL : 3.983357 sec
+ 12,890,762,548 cycles # 2.986 GHz
+ 28,149,713,448 instructions # 2.18 insn per cycle
+ 4.374511500 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.414546e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.414780e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.414780e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.327736e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.327962e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.327962e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.278557 sec
- 18,784,400,880 cycles # 2.990 GHz
- 53,915,743,321 instructions # 2.87 insn per cycle
- 6.282578284 seconds time elapsed
+TOTAL : 6.345395 sec
+ 18,808,426,055 cycles # 2.963 GHz
+ 53,915,859,593 instructions # 2.87 insn per cycle
+ 6.349306785 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.622225e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.622313e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.622313e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.631387e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.631477e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.631477e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.260349 sec
- 9,843,353,366 cycles # 3.016 GHz
- 27,093,120,012 instructions # 2.75 insn per cycle
- 3.264542212 seconds time elapsed
+TOTAL : 3.247242 sec
+ 9,798,431,936 cycles # 3.015 GHz
+ 27,093,078,884 instructions # 2.77 insn per cycle
+ 3.251306892 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.543297e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.543763e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.543763e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.527269e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.527671e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.527671e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.494911 sec
- 4,247,565,583 cycles # 2.835 GHz
- 9,561,660,282 instructions # 2.25 insn per cycle
- 1.498994646 seconds time elapsed
+TOTAL : 1.502062 sec
+ 4,254,510,227 cycles # 2.826 GHz
+ 9,561,365,042 instructions # 2.25 insn per cycle
+ 1.506086006 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.041064e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.041630e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.041630e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.044745e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.045315e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.045315e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.312043 sec
- 3,711,873,932 cycles # 2.822 GHz
- 8,485,580,977 instructions # 2.29 insn per cycle
- 1.316064551 seconds time elapsed
+TOTAL : 1.310362 sec
+ 3,714,842,589 cycles # 2.828 GHz
+ 8,485,417,237 instructions # 2.28 insn per cycle
+ 1.314439582 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.655846e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.656376e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.656376e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.650927e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.651448e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.651448e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.450066 sec
- 2,692,078,825 cycles # 1.852 GHz
- 4,273,245,565 instructions # 1.59 insn per cycle
- 1.454158841 seconds time elapsed
+TOTAL : 1.452786 sec
+ 2,695,403,304 cycles # 1.852 GHz
+ 4,273,125,151 instructions # 1.59 insn per cycle
+ 1.456779010 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index fbbae31086..f73b319e4d 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:38:33
+DATE: 2023-11-08_21:55:03
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.071334e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.072304e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.072304e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.070004e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.071005e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.071005e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.376547 sec
- 8,066,576,997 cycles # 2.992 GHz
- 17,224,378,863 instructions # 2.14 insn per cycle
- 2.753340167 seconds time elapsed
+TOTAL : 2.374046 sec
+ 8,061,206,235 cycles # 2.993 GHz
+ 17,860,181,288 instructions # 2.22 insn per cycle
+ 2.750065172 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.219956e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.252584e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.252584e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.226901e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.259810e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.259810e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.983566 sec
- 12,755,700,095 cycles # 2.969 GHz
- 26,780,853,821 instructions # 2.10 insn per cycle
- 4.362214203 seconds time elapsed
+TOTAL : 3.996223 sec
+ 12,903,615,719 cycles # 2.989 GHz
+ 27,064,646,353 instructions # 2.10 insn per cycle
+ 4.375939404 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.520548e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.520796e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.520796e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.320809e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.321082e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.321082e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.203566 sec
- 18,829,459,475 cycles # 3.034 GHz
- 53,915,868,697 instructions # 2.86 insn per cycle
- 6.207586404 seconds time elapsed
+TOTAL : 6.351015 sec
+ 18,895,432,596 cycles # 2.975 GHz
+ 53,920,363,469 instructions # 2.85 insn per cycle
+ 6.355030283 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.632618e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632708e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632708e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.632581e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.632679e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.632679e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.239187 sec
- 9,805,468,555 cycles # 3.024 GHz
- 27,094,086,958 instructions # 2.76 insn per cycle
- 3.243245202 seconds time elapsed
+TOTAL : 3.239771 sec
+ 9,805,010,159 cycles # 3.023 GHz
+ 27,094,031,310 instructions # 2.76 insn per cycle
+ 3.243901475 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.541893e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.542348e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.542348e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.542776e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.543249e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.543249e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.496042 sec
- 4,247,154,617 cycles # 2.833 GHz
- 9,562,315,517 instructions # 2.25 insn per cycle
- 1.500165545 seconds time elapsed
+TOTAL : 1.496243 sec
+ 4,233,173,830 cycles # 2.823 GHz
+ 9,562,510,318 instructions # 2.26 insn per cycle
+ 1.500255263 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.062512e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.063083e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.063083e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.008828e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.009454e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.009454e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.305609 sec
- 3,707,362,205 cycles # 2.832 GHz
- 8,486,374,508 instructions # 2.29 insn per cycle
- 1.309600698 seconds time elapsed
+TOTAL : 1.322886 sec
+ 3,744,251,192 cycles # 2.823 GHz
+ 8,486,441,130 instructions # 2.27 insn per cycle
+ 1.326937869 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.623189e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.623772e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.623772e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.594601e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.595186e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.595186e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.463361 sec
- 2,697,367,089 cycles # 1.839 GHz
- 4,274,143,132 instructions # 1.58 insn per cycle
- 1.467446249 seconds time elapsed
+TOTAL : 1.474018 sec
+ 2,696,155,761 cycles # 1.825 GHz
+ 4,274,155,931 instructions # 1.59 insn per cycle
+ 1.478064357 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index c51993cada..7a2b2c0da9 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2023-11-03_19:13:56
+DATE: 2023-11-08_21:28:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.063023e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.063394e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.063534e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.069743e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.070108e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.070239e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.424921 sec
- 8,232,683,158 cycles # 2.990 GHz
- 17,655,317,796 instructions # 2.14 insn per cycle
- 2.812107310 seconds time elapsed
+TOTAL : 2.423143 sec
+ 8,082,723,641 cycles # 2.933 GHz
+ 18,147,438,278 instructions # 2.25 insn per cycle
+ 2.812330272 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.268141e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.269954e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.270195e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.271955e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.273887e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.274124e+03 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.993040 sec
- 12,961,046,511 cycles # 3.002 GHz
- 29,041,240,897 instructions # 2.24 insn per cycle
- 4.374135451 seconds time elapsed
+TOTAL : 3.988843 sec
+ 13,001,327,656 cycles # 3.014 GHz
+ 27,551,753,777 instructions # 2.12 insn per cycle
+ 4.370037996 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.423791e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.424026e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.424026e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.093188e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.093423e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.093423e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.281065 sec
- 18,737,351,960 cycles # 2.982 GHz
- 53,924,990,961 instructions # 2.88 insn per cycle
- 6.285160496 seconds time elapsed
+TOTAL : 6.521355 sec
+ 18,798,207,330 cycles # 2.882 GHz
+ 53,926,908,182 instructions # 2.87 insn per cycle
+ 6.525452544 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:32062) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.617244e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.617330e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.617330e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.629486e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.629575e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.629575e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.274940 sec
- 9,810,206,221 cycles # 2.993 GHz
- 27,090,315,670 instructions # 2.76 insn per cycle
- 3.279033724 seconds time elapsed
+TOTAL : 3.245853 sec
+ 9,848,079,716 cycles # 3.031 GHz
+ 27,090,265,030 instructions # 2.75 insn per cycle
+ 3.250037477 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96284) (avx2: 0) (512y: 0) (512z: 0)
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.504500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504945e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504945e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.490286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.490752e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.490752e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.511756 sec - 4,249,692,377 cycles # 2.805 GHz - 9,561,658,782 instructions # 2.25 insn per cycle - 1.515796071 seconds time elapsed +TOTAL : 1.517008 sec + 4,257,648,545 cycles # 2.800 GHz + 9,561,344,255 instructions # 2.25 insn per cycle + 1.521285854 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.067567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.068141e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.068141e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.021344e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.021901e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.021901e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.304248 sec - 3,697,935,435 cycles # 2.828 GHz - 8,485,512,243 instructions # 2.29 insn per cycle - 1.308302011 seconds time elapsed +TOTAL : 1.319133 sec + 3,701,318,743 cycles # 2.798 GHz + 8,485,189,781 instructions # 2.29 insn per cycle + 1.323286884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.626044e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.626572e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.626572e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.378941e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379448e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379448e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.462511 sec - 2,704,261,685 cycles # 1.846 GHz - 4,277,565,036 instructions # 1.58 insn per cycle - 1.466688212 seconds time elapsed +TOTAL : 1.566088 sec + 2,698,066,709 cycles # 1.719 GHz + 4,276,879,461 instructions # 1.59 insn per cycle + 1.570153625 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 0a60ba6d62..f4e838f103 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:14:59 +DATE: 2023-11-08_21:29:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757584e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758488e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758845e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.755384e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.756376e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.756775e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659896 sec - 5,702,631,198 cycles # 2.947 GHz - 11,810,983,379 instructions # 2.07 insn per cycle - 1.991424837 seconds time elapsed +TOTAL : 1.659165 sec + 5,717,115,891 cycles # 2.955 GHz + 12,190,075,892 instructions # 2.13 insn per cycle + 1.991284959 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.332515e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.333177e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333265e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328819e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.329492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329584e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929687 sec - 6,546,483,377 cycles # 2.952 GHz - 14,155,312,120 instructions # 2.16 insn per cycle - 2.273547514 seconds time elapsed +TOTAL : 1.928671 sec + 6,641,955,934 cycles # 3.003 GHz + 14,330,947,638 instructions # 2.16 insn per cycle + 2.270510678 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.817807e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.818080e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.818080e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.903818e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.904090e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.904090e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.991502 sec - 17,897,297,418 cycles # 2.986 GHz - 53,590,305,749 instructions # 2.99 insn per cycle - 5.995609214 seconds time elapsed +TOTAL : 5.935151 sec + 17,988,960,616 cycles # 3.029 GHz + 53,590,161,611 instructions # 2.98 insn per cycle + 5.939109392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.535145e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535592e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535592e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.520103e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520628e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520628e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.499015 sec - 4,559,682,745 cycles # 3.035 GHz - 13,762,791,022 instructions # 3.02 insn per cycle - 1.503172123 seconds time elapsed +TOTAL : 1.505890 sec + 4,563,568,647 cycles # 3.024 GHz + 13,762,453,321 instructions # 3.02 insn per cycle + 1.509910484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.101340e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.103065e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.103065e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.038019e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.039763e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.039763e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.748885 sec - 2,136,693,329 cycles # 2.841 GHz - 4,817,082,222 instructions # 2.25 insn per cycle - 0.752876610 seconds time elapsed +TOTAL : 0.756034 sec + 2,141,156,270 cycles # 2.820 GHz + 4,816,859,984 instructions # 2.25 insn per cycle + 0.760083736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.112158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114365e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.114365e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.079503e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.081743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081743e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.656308 sec - 1,869,942,366 cycles # 2.835 GHz - 4,274,318,244 instructions # 2.29 insn per cycle - 0.660301551 seconds time elapsed +TOTAL : 0.658883 sec + 1,871,387,054 cycles # 2.825 GHz + 4,273,792,692 instructions # 2.28 insn per cycle + 0.663026186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.296564e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.298817e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.298817e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.037980e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.040224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.040224e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.728798 sec - 1,352,736,555 cycles # 1.847 GHz - 2,158,877,197 instructions # 1.60 insn per cycle - 0.732817833 seconds time elapsed +TOTAL : 0.756823 sec + 1,355,166,582 cycles # 1.782 GHz + 2,158,764,056 instructions # 1.59 insn per cycle + 0.760952708 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2878) (512y: 49) (512z:79298) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 17034b30a2..6fa929f5b1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:39:36 +DATE: 2023-11-08_21:56:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.806522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.808414e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.808414e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.804869e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806749e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806749e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.595844 sec - 5,598,060,641 cycles # 2.994 GHz - 11,899,085,664 instructions # 2.13 insn per cycle - 1.927316991 seconds time elapsed +TOTAL : 1.602285 sec + 5,612,741,884 cycles # 2.994 GHz + 11,823,721,041 instructions # 2.11 insn per cycle + 1.932057655 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.306726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320071e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320071e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321250e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.334433e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.334433e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.896426 sec - 6,482,998,335 cycles # 2.990 GHz - 13,087,346,923 instructions # 2.02 insn per cycle - 2.228516012 seconds time elapsed +TOTAL : 1.875073 sec + 6,423,111,015 cycles # 2.987 GHz + 14,218,262,182 instructions # 2.21 insn per cycle + 2.206850504 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.982697e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.982966e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.982966e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.905231e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.905509e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.905509e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.882902 sec - 17,886,003,642 cycles # 3.039 GHz - 53,589,820,489 instructions # 3.00 insn per cycle - 5.886864227 seconds time elapsed +TOTAL : 5.938687 sec + 17,836,764,476 cycles # 3.002 GHz + 53,590,153,759 instructions # 3.00 insn per cycle + 5.942639449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.517559e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518006e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518006e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489420e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489830e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489830e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.508545 sec - 4,560,262,414 cycles # 3.016 GHz - 13,763,353,615 instructions # 3.02 insn per cycle - 1.512732617 seconds time elapsed +TOTAL : 1.517580 sec + 4,611,683,817 cycles # 3.032 GHz + 13,763,345,896 instructions # 2.98 insn per cycle + 1.521625428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.047943e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.049624e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.049624e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.247085e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248950e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248950e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.755133 sec - 2,153,006,129 cycles # 2.839 GHz - 4,818,213,561 instructions # 2.24 insn per cycle - 0.759225829 seconds time elapsed +TOTAL : 0.733666 sec + 2,134,815,435 cycles # 2.897 GHz + 4,817,815,542 instructions # 2.26 insn per cycle + 0.737580401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.134004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.136209e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.136209e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.255023e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257521e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257521e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.654556 sec - 1,870,329,136 cycles # 2.842 GHz - 4,274,869,931 instructions # 2.29 insn per cycle - 0.658687365 seconds time elapsed +TOTAL : 0.644323 sec + 1,868,915,722 cycles # 2.886 GHz + 4,274,871,857 instructions # 2.29 insn per cycle + 0.648325497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.265196e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.267580e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.267580e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.514603e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.516833e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.516833e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732395 sec - 1,354,970,411 cycles # 1.842 GHz - 2,159,667,135 instructions # 1.59 insn per cycle - 0.736399157 seconds time elapsed +TOTAL : 0.708018 sec + 1,353,648,095 cycles # 1.903 GHz + 2,159,618,866 instructions # 1.60 insn per cycle + 0.711901071 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2878) (512y: 49) (512z:79298) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 9247dc6a21..2b69abf3e0 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:15:46 +DATE: 2023-11-08_21:30:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757824e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758656e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758919e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.751553e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.752429e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.752778e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.658123 sec - 5,805,953,943 cycles # 3.008 GHz - 12,018,291,784 instructions # 2.07 insn per cycle - 1.988767308 seconds time elapsed +TOTAL : 1.662264 sec + 5,791,514,994 cycles # 2.989 GHz + 11,290,505,064 instructions # 1.95 insn per cycle + 1.994487544 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327280e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.327957e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328041e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318654e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319320e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319463e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.929152 sec - 6,666,976,802 cycles # 3.013 GHz - 13,831,721,664 instructions # 2.07 insn per cycle - 2.269150647 seconds time elapsed +TOTAL : 1.936834 sec + 6,513,518,428 cycles # 2.942 GHz + 13,310,876,477 instructions # 2.04 insn per cycle + 2.270995377 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.798758e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.799028e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.799028e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.877357e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.877629e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.877629e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.002749 sec - 17,897,748,334 cycles # 2.981 GHz - 53,583,210,251 instructions # 2.99 insn per cycle - 6.006727820 seconds time elapsed +TOTAL : 5.953085 sec + 17,926,444,710 cycles # 3.010 GHz + 53,580,674,845 instructions # 2.99 insn per cycle + 5.957045253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.533102e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533527e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533527e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.538806e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539230e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539230e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.498905 sec - 4,550,573,846 cycles # 3.029 GHz - 13,756,139,320 instructions # 3.02 insn per cycle - 1.503009468 seconds time elapsed +TOTAL : 1.497134 sec + 4,549,359,025 cycles # 3.032 GHz + 13,755,898,061 instructions # 3.02 insn per cycle + 1.501295301 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.049905e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051589e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051589e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.000854e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.002553e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.002553e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.753863 sec - 2,147,980,052 cycles # 2.837 GHz - 4,819,413,658 instructions # 2.24 insn per cycle - 0.757858909 seconds time elapsed +TOTAL : 0.759453 sec + 2,151,217,111 cycles # 2.820 GHz + 4,818,966,673 instructions # 2.24 insn per cycle + 0.763529614 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.121398e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.123528e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.123528e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.076028e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.078137e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078137e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.655569 sec - 1,875,337,702 cycles # 2.847 GHz - 4,276,013,202 instructions # 2.28 insn per cycle - 0.659452126 seconds time elapsed +TOTAL : 0.658855 sec + 1,875,464,841 cycles # 2.832 GHz + 4,275,819,002 instructions # 2.28 insn per cycle + 0.662852680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.258028e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.260328e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.260328e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.283691e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.286276e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.286276e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.732438 sec - 1,358,895,231 cycles # 1.851 GHz - 2,165,631,438 instructions # 1.59 insn per cycle - 0.736476884 seconds time elapsed +TOTAL : 0.730286 sec + 1,357,956,935 cycles # 1.851 GHz + 2,164,994,730 instructions # 1.59 insn per cycle + 0.734341079 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3475) (512y: 34) (512z:79492) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 67db6760e6..c2c8a96928 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:16:34 +DATE: 2023-11-08_21:31:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.697393e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.698008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.698206e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.686778e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.687273e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687409e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.168263 sec - 7,466,161,453 cycles # 3.002 GHz - 16,782,968,221 instructions # 2.25 insn per cycle - 2.544374597 seconds time elapsed +TOTAL : 2.171824 sec + 7,456,169,166 cycles # 2.995 GHz + 14,898,137,129 instructions # 2.00 insn per cycle + 2.549362993 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111494e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111753e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111788e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112892e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113171e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113203e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.403934 sec - 11,261,999,951 cycles # 3.015 GHz - 23,279,217,600 instructions # 2.07 insn per cycle - 3.795199307 seconds time elapsed +TOTAL : 3.401203 sec + 11,249,483,891 cycles # 3.009 GHz + 24,262,391,957 instructions # 2.16 insn per cycle + 3.794357278 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.891205e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.891420e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.891420e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.772311e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.772526e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.772526e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.695697 sec - 19,121,802,644 cycles # 2.855 GHz - 54,152,938,154 instructions # 2.83 insn per cycle - 6.699723618 seconds time elapsed +TOTAL : 6.810863 sec + 19,135,542,784 cycles # 2.808 GHz + 54,153,577,866 instructions # 2.83 insn per cycle + 6.814854998 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589938e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590022e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590022e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589475e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.589562e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589562e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327144 sec - 9,411,187,085 cycles # 2.826 GHz - 26,159,441,613 instructions # 2.78 insn per cycle - 3.331341639 seconds time elapsed +TOTAL : 3.327738 sec + 9,417,973,850 cycles # 2.827 GHz + 26,159,432,180 instructions # 2.78 insn per cycle + 3.331899471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96005) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.556465e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556911e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556911e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.728829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729288e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729288e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488421 sec - 4,038,495,427 cycles # 2.707 GHz - 9,228,280,089 instructions # 2.29 insn per cycle - 1.492543554 seconds time elapsed +TOTAL : 1.420979 sec + 4,041,656,459 cycles # 2.838 GHz + 9,227,906,681 instructions # 2.28 insn per cycle + 1.425059392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.276116e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276827e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.276827e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.219686e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.220314e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.220314e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.240270 sec - 3,525,917,357 cycles # 2.835 GHz - 8,175,363,577 instructions # 2.32 insn per cycle - 1.244573424 seconds time elapsed +TOTAL : 1.256653 sec + 3,545,597,499 cycles # 2.814 GHz + 8,175,250,543 instructions # 2.31 insn per cycle + 1.260805357 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671636e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672174e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672174e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.660023e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.660558e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.660558e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.443903 sec - 2,654,961,238 cycles # 1.834 GHz - 4,155,116,507 instructions # 1.57 insn per cycle - 1.448186385 seconds time elapsed +TOTAL : 1.447622 sec + 2,657,673,224 cycles # 1.832 GHz + 4,154,915,823 instructions # 1.56 insn per cycle + 1.451764331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2045) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ba876e5994..485a0059f2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-03_19:17:35 +DATE: 2023-11-08_21:32:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.679011e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.679665e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.679866e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.688491e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689012e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.689176e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.173579 sec - 7,474,410,637 cycles # 3.001 GHz - 15,946,585,145 instructions # 2.13 insn per cycle - 2.550103231 seconds time elapsed +TOTAL : 2.168620 sec + 7,451,542,055 cycles # 2.994 GHz + 15,551,253,703 instructions # 2.09 insn per cycle + 2.545633518 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109202e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109492e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108171e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.402138 sec - 11,227,553,919 cycles # 3.005 GHz - 23,286,904,291 instructions # 2.07 insn per cycle - 3.792137186 seconds time elapsed +TOTAL : 3.405108 sec + 11,192,783,578 cycles # 3.001 GHz + 25,734,796,379 instructions # 2.30 insn per cycle + 3.786804275 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.862068e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.862272e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.862272e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.066104e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.066369e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.066369e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.723543 sec - 19,074,467,052 cycles # 2.836 GHz - 54,156,087,092 instructions # 2.84 insn per cycle - 6.727488337 seconds time elapsed +TOTAL : 6.548059 sec + 19,079,779,477 cycles # 2.913 GHz + 54,153,651,610 instructions # 2.84 insn per cycle + 6.552064899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.568667e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568765e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568765e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589149e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.589238e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589238e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.375716 sec - 9,382,313,393 cycles # 2.776 GHz - 26,079,058,590 instructions # 2.78 insn per cycle - 3.379999018 seconds time elapsed +TOTAL : 3.327579 sec + 9,382,040,636 cycles # 2.817 GHz + 26,078,619,591 instructions # 2.78 insn per cycle + 3.331633706 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95899) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662540e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.663002e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.663002e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.662193e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.662639e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662639e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.448110 sec - 4,074,555,185 cycles # 2.807 GHz - 9,213,769,276 instructions # 2.26 insn per cycle - 1.452285529 seconds time elapsed +TOTAL : 1.447113 sec + 4,073,138,574 cycles # 2.808 GHz + 9,213,586,675 instructions # 2.26 insn per cycle + 1.451209760 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.250454e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.251202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.251202e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.194379e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.195039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.195039e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.248074 sec - 3,536,570,557 cycles # 2.826 GHz - 8,168,521,757 instructions # 2.31 insn per cycle - 1.252256213 seconds time elapsed +TOTAL : 1.264023 sec + 3,548,672,085 cycles # 2.800 GHz + 8,168,128,611 instructions # 2.30 insn per cycle + 1.268138683 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.691090e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.691677e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.691677e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.707082e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.707666e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.707666e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.437256 sec - 2,622,132,529 cycles # 1.820 GHz - 4,153,851,791 instructions # 1.58 insn per cycle - 1.441375266 seconds time elapsed +TOTAL : 1.430570 sec + 2,620,935,291 cycles # 1.830 GHz + 4,154,056,327 instructions # 1.58 insn per cycle + 1.434770233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1492) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 32c5e2345e..45ec48d9b4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:10 +DATE: 2023-11-08_21:26:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.931878e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.341004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.663503e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.850720e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.646421e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446607 sec - 1,970,164,515 cycles # 2.938 GHz - 2,759,248,123 instructions # 1.40 insn per cycle - 0.729204009 seconds time elapsed +TOTAL : 0.445414 sec + 1,963,940,666 cycles # 2.941 GHz + 2,761,951,187 instructions # 1.41 insn per cycle + 0.725454441 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.710415e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.163714e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.497427e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.571453e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.132541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489040e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523022 sec - 2,217,601,456 cycles # 2.938 GHz - 3,205,519,009 instructions # 1.45 insn per cycle - 0.813078242 seconds time elapsed +TOTAL : 0.525989 sec + 2,266,225,819 cycles # 2.950 GHz + 3,255,459,976 instructions # 1.44 insn per cycle + 0.825689166 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.073669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.074272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096702e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.548155 sec - 4,698,700,649 cycles # 3.029 GHz - 13,467,797,998 instructions # 2.87 insn per cycle - 1.552304744 seconds time elapsed +TOTAL : 1.547835 sec + 4,705,088,880 cycles # 3.034 GHz + 13,467,070,551 instructions # 2.86 insn per cycle + 1.551905661 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021816e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.836387e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.906822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.906822e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862036 sec - 2,624,478,574 cycles # 3.032 GHz - 7,556,486,050 instructions # 2.88 insn per cycle - 0.866308924 seconds time elapsed +TOTAL : 0.914850 sec + 2,629,820,703 cycles # 2.863 GHz + 7,555,643,977 instructions # 2.87 insn per cycle + 0.919312372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.306326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524533e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524533e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.179916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.388522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.388522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.517734 sec - 1,480,526,951 cycles # 2.839 GHz - 3,123,082,416 instructions # 2.11 insn per cycle - 0.522085763 seconds time elapsed +TOTAL : 0.538121 sec + 1,483,909,982 cycles # 2.739 GHz + 3,122,112,991 instructions # 2.10 insn per cycle + 0.542506000 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.669407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.933881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.933881e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.492769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.748148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748148e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.468132 sec - 1,341,729,382 cycles # 2.844 GHz - 2,984,537,487 instructions # 2.22 insn per cycle - 0.472335074 seconds time elapsed +TOTAL : 0.492302 sec + 1,352,205,323 cycles # 2.727 GHz + 2,983,986,621 instructions # 2.21 insn per cycle + 0.496759795 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.279474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.384367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.384367e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.316160e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.426685e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.426685e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.743758 sec - 1,327,382,690 cycles # 1.776 GHz - 1,956,119,028 instructions # 1.47 insn per cycle - 0.747985259 seconds time elapsed +TOTAL : 0.732612 sec + 1,330,714,647 cycles # 1.807 GHz + 1,956,053,126 instructions # 1.47 insn per cycle + 0.737097876 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 83cbc116b3..9573fdc8ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:36:51 +DATE: 2023-11-08_21:53:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.568026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132079e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132079e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.674751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241786e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.473711 sec - 2,006,451,769 cycles # 2.929 GHz - 2,970,353,925 instructions # 1.48 insn per cycle - 0.742629859 seconds time elapsed +TOTAL : 0.472075 sec + 2,011,630,446 cycles # 2.946 GHz + 2,977,593,506 instructions # 1.48 insn per cycle + 0.740354864 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.250433e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283042e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283042e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.374405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.374405e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.748674 sec - 3,002,657,574 cycles # 2.966 GHz - 4,543,695,427 instructions # 1.51 insn per cycle - 1.069550305 seconds time elapsed +TOTAL : 0.746007 sec + 2,930,819,015 cycles # 2.951 GHz + 4,513,689,699 instructions # 1.54 insn per cycle + 1.051041659 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091931e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091931e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.067462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089978e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.560726 sec - 4,731,718,585 cycles # 3.025 GHz - 13,472,168,375 instructions # 2.85 insn per cycle - 1.565141837 seconds time elapsed +TOTAL : 1.563342 sec + 4,743,647,659 cycles # 3.027 GHz + 13,474,115,002 instructions # 2.84 insn per cycle + 1.567732700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.973174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.973174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931899e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.004806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.004806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.892296 sec - 2,670,244,018 cycles # 2.980 GHz - 7,605,526,435 instructions # 2.85 insn per cycle - 0.896907337 seconds time elapsed +TOTAL : 0.876421 sec + 2,657,928,129 cycles # 3.020 GHz + 7,605,320,089 instructions # 2.86 insn per cycle + 0.880831982 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.091835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.296236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.296236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.284426e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500691e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.561077 sec - 1,524,432,631 cycles # 2.698 GHz - 3,172,781,548 instructions # 2.08 insn per cycle - 0.565642937 seconds time elapsed +TOTAL : 0.528369 sec + 1,515,520,073 cycles # 2.846 GHz + 3,173,010,189 instructions # 2.09 insn per cycle + 0.533003329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.608228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.871141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.871141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.626971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.890157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.483758 sec - 1,382,209,807 cycles # 2.835 GHz - 3,035,256,040 instructions # 2.20 insn per cycle - 0.488244630 seconds time elapsed +TOTAL : 0.480865 sec + 1,378,241,594 cycles # 2.844 GHz + 3,034,725,088 instructions # 2.20 insn per cycle + 0.485339539 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.425183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.544675e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.544675e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.445545e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.566504e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.566504e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.707575 sec - 1,368,070,277 cycles # 1.923 GHz - 1,995,483,449 instructions # 1.46 insn per cycle - 0.712159059 seconds time elapsed +TOTAL : 0.701984 sec + 1,365,857,372 cycles # 1.935 GHz + 1,995,672,274 instructions # 1.46 insn per cycle + 0.706431315 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 5c16312148..a982c1092c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:27 +DATE: 2023-11-08_21:26:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.898292e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.236740e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.548470e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.808432e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.231946e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554075e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444696 sec - 1,938,726,844 cycles # 2.937 GHz - 2,756,323,630 instructions # 1.42 insn per cycle - 0.718363875 seconds time elapsed +TOTAL : 0.446409 sec + 1,914,723,819 cycles # 2.858 GHz + 2,720,530,830 instructions # 1.42 insn per cycle + 0.726781250 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.682843e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.082328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.409380e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.542836e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.030986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.390683e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.522202 sec - 2,220,530,283 cycles # 2.941 GHz - 3,184,953,404 instructions # 1.43 insn per cycle - 0.811776517 seconds time elapsed +TOTAL : 0.529359 sec + 2,191,645,691 cycles # 2.864 GHz + 3,157,433,372 instructions # 1.44 insn per cycle + 0.823018193 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092872e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036305e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058434e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.552829 sec - 4,705,329,544 cycles # 3.023 GHz - 13,461,758,666 instructions # 2.86 insn per cycle - 1.556952692 seconds time elapsed +TOTAL : 1.603791 sec + 4,708,850,584 cycles # 2.929 GHz + 13,461,227,684 instructions # 2.86 insn per cycle + 1.607981971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.854678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928501e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.862207 sec - 2,624,178,818 cycles # 3.031 GHz - 7,555,487,904 instructions # 2.88 insn per cycle - 0.866510467 seconds time elapsed +TOTAL : 0.906299 sec + 2,638,123,420 cycles # 2.899 GHz + 7,554,662,347 instructions # 2.86 insn per cycle + 0.910729092 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.292100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331862e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.519779 sec - 1,479,324,919 cycles # 2.825 GHz - 3,121,432,800 instructions # 2.11 insn per cycle - 0.524166869 seconds time elapsed +TOTAL : 0.548282 sec + 1,490,121,110 cycles # 2.699 GHz + 3,120,571,278 instructions # 2.09 insn per cycle + 0.552853693 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.586783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.851292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.851292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.716719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716719e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.479341 sec - 1,345,156,968 cycles # 2.785 GHz - 2,982,279,143 instructions # 2.22 insn per cycle - 0.483569808 seconds time elapsed +TOTAL : 0.496477 sec + 1,349,987,385 cycles # 2.699 GHz + 2,981,775,320 instructions # 2.21 insn per cycle + 0.500801099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.481639e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.600263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.600263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.283025e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.395178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.395178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.683909 sec - 1,326,826,217 cycles # 1.930 GHz - 1,955,120,469 instructions # 1.47 insn per cycle - 0.688253496 seconds time elapsed +TOTAL : 0.742923 sec + 1,336,539,142 cycles # 1.791 GHz + 1,954,402,399 instructions # 1.46 insn per cycle + 0.747445158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 59e9dbfb13..0870ac1612 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:11:45 +DATE: 2023-11-08_21:26:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.904199e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231536e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359887e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.731772e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218499e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.346344e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.438419 sec - 1,915,720,301 cycles # 2.940 GHz - 2,722,845,778 instructions # 1.42 insn per cycle - 0.708695201 seconds time elapsed +TOTAL : 0.443647 sec + 1,860,995,101 cycles # 2.829 GHz + 2,577,640,181 instructions # 1.39 insn per cycle + 0.715495532 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.256707e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.834983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.952518e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.975472e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.830515e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954464e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.473385 sec - 2,068,832,196 cycles # 2.955 GHz - 2,965,580,704 instructions # 1.43 insn per cycle - 0.757067346 seconds time elapsed +TOTAL : 0.479313 sec + 1,996,286,635 cycles # 2.831 GHz + 2,879,932,210 instructions # 1.44 insn per cycle + 0.762534142 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.135878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.068560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092999e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.462461 sec - 4,454,737,328 cycles # 3.039 GHz - 13,053,159,453 instructions # 2.93 insn per cycle - 1.466494148 seconds time elapsed +TOTAL : 1.555116 sec + 4,461,765,661 cycles # 2.863 GHz + 13,052,553,175 instructions # 2.93 insn per cycle + 1.559192669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.046237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238088e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.882925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070631e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.557296 sec - 1,699,998,155 cycles # 3.031 GHz - 4,515,681,552 instructions # 2.66 insn per cycle - 0.561435544 seconds time elapsed +TOTAL : 0.589518 sec + 1,706,750,598 cycles # 2.878 GHz + 4,515,023,670 instructions # 2.65 insn per cycle + 0.593859816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.648399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.355867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.355867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.765834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.493743e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.493743e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311054 sec - 851,131,460 cycles # 2.704 GHz - 1,899,263,660 instructions # 2.23 insn per cycle - 0.315235937 seconds time elapsed +TOTAL : 0.305319 sec + 853,645,854 cycles # 2.763 GHz + 1,898,477,314 instructions # 2.22 insn per cycle + 0.309705869 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.243995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.098185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.141881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.979826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979826e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282792 sec - 800,211,416 cycles # 2.794 GHz - 1,822,370,089 instructions # 2.28 insn per cycle - 0.286974618 seconds time elapsed +TOTAL : 0.287752 sec + 800,772,449 cycles # 2.748 GHz + 1,821,769,219 instructions # 2.28 insn per cycle + 0.292040341 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 29,217,754 cycles # 2.652 GHz - 42,284,295 instructions # 1.45 insn per cycle - 0.011406114 seconds time elapsed + 29,120,008 cycles # 2.647 GHz + 41,681,258 instructions # 1.43 insn per cycle + 0.011379573 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index f15afb12c1..0597ee22a3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:37:10 +DATE: 2023-11-08_21:53:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.572083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.023629e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.023629e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.639706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.257120e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.257120e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.454387 sec - 1,955,352,187 cycles # 2.938 GHz - 2,863,812,902 instructions # 1.46 insn per cycle - 0.722319097 seconds time elapsed +TOTAL : 0.449974 sec + 1,947,595,961 cycles # 2.942 GHz + 2,880,549,080 instructions # 1.48 insn per cycle + 0.719459148 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.087118e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.599283e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.599283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.168463e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.812098e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.812098e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.623566 sec - 2,498,674,729 cycles # 2.923 GHz - 3,766,117,574 instructions # 1.51 insn per cycle - 0.913465239 seconds time elapsed +TOTAL : 0.616345 sec + 2,486,978,198 cycles # 2.935 GHz + 3,790,315,811 instructions # 1.52 insn per cycle + 0.904030203 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.124937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.150391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.150391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.130080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155980e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.480709 sec - 4,471,348,915 cycles # 3.013 GHz - 13,056,806,498 instructions # 2.92 insn per cycle - 1.485019275 seconds time elapsed +TOTAL : 1.473693 sec + 4,471,154,333 cycles # 3.027 GHz + 13,056,458,670 instructions # 2.92 insn per cycle + 1.477812555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.015036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.208624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.208624e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.025004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.219703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.567631 sec - 1,721,622,943 cycles # 3.014 GHz - 4,563,283,810 instructions # 2.65 insn per cycle - 0.571796628 seconds time elapsed +TOTAL : 0.566150 sec + 1,723,667,712 cycles # 3.025 GHz + 4,563,297,886 instructions # 2.65 insn per cycle + 0.570436411 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.904492e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.858362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.587732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.587732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.302253 sec - 872,846,100 cycles # 2.852 GHz - 1,935,401,156 instructions # 2.22 insn per cycle - 0.306655862 seconds time elapsed +TOTAL : 0.303895 sec + 871,800,602 cycles # 2.835 GHz + 1,935,423,519 instructions # 2.22 insn per cycle + 0.308064640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.271441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.120717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.340209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.201036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.201036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.285638 sec - 819,147,203 cycles # 2.831 GHz - 1,858,340,668 instructions # 2.27 insn per cycle - 0.289825539 seconds time elapsed +TOTAL : 0.282456 sec + 818,779,897 cycles # 2.862 GHz + 1,858,681,592 instructions # 2.27 insn per cycle + 0.286757422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -211,9 +211,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 37,779,421 cycles # 2.664 GHz - 50,267,131 instructions # 1.33 insn per cycle - 0.014729622 seconds time elapsed + 37,403,629 cycles # 2.691 GHz + 50,469,890 instructions # 1.35 insn per cycle + 0.014372629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index c8e32c45f6..1f88f16cf0 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:01 +DATE: 2023-11-08_21:26:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.816263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.233557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356584e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.710525e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326321e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439029 sec - 1,906,384,387 cycles # 2.932 GHz - 2,668,630,925 instructions # 1.40 insn per cycle - 0.709025104 seconds time elapsed +TOTAL : 0.440656 sec + 1,914,804,492 cycles # 2.925 GHz + 2,653,138,253 instructions # 1.39 insn per cycle + 0.711749358 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.165457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788318e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899924e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.891594e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.784645e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.475153 sec - 2,060,825,458 cycles # 2.945 GHz - 2,959,751,148 instructions # 1.44 insn per cycle - 0.758667305 seconds time elapsed +TOTAL : 0.476610 sec + 2,083,856,457 cycles # 2.940 GHz + 2,965,032,628 instructions # 1.42 insn per cycle + 0.765879589 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129719e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.470613 sec - 4,452,780,841 cycles # 3.021 GHz - 13,033,295,085 instructions # 2.93 insn per cycle - 1.474743963 seconds time elapsed +TOTAL : 1.470593 sec + 4,452,128,732 cycles # 3.020 GHz + 13,033,118,765 instructions # 2.93 insn per cycle + 1.474660881 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.000043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.190804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.190804e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.040157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.234537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.234537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.566289 sec - 1,691,331,084 cycles # 2.968 GHz - 4,511,809,710 instructions # 2.67 insn per cycle - 0.570477990 seconds time elapsed +TOTAL : 0.558718 sec + 1,691,566,910 cycles # 3.008 GHz + 4,511,110,866 instructions # 2.67 insn per cycle + 0.562886591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.392978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.034440e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.034440e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.942184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.690459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690459e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325119 sec - 853,124,200 cycles # 2.596 GHz - 1,896,337,755 instructions # 2.22 insn per cycle - 0.329328797 seconds time elapsed +TOTAL : 0.296099 sec + 853,486,904 cycles # 2.847 GHz + 1,895,390,282 instructions # 2.22 insn per cycle + 0.300311325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.399192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.280649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.280649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.374489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.242458e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.242458e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.275864 sec - 799,266,525 cycles # 2.860 GHz - 1,818,357,527 instructions # 2.28 insn per cycle - 0.279975539 seconds time elapsed +TOTAL : 0.277008 sec + 800,885,707 cycles # 2.855 GHz + 1,817,516,411 instructions # 2.27 insn per cycle + 0.281135474 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 28,695,242 cycles # 2.686 GHz - 41,682,313 instructions # 1.45 insn per cycle - 0.011083970 seconds time elapsed + 28,754,068 cycles # 2.640 GHz + 40,955,371 instructions # 1.42 insn per cycle + 0.011419598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1932) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 2f090614c3..d5ef07e007 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:17 +DATE: 2023-11-08_21:27:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.924011e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312316e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.652376e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.821562e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300473e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.628825e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444193 sec - 1,982,708,723 cycles # 2.960 GHz - 2,773,326,834 instructions # 1.40 insn per cycle - 0.727594315 seconds time elapsed +TOTAL : 0.447954 sec + 1,932,564,435 cycles # 2.921 GHz + 2,743,560,511 instructions # 1.42 insn per cycle + 0.719731147 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.716781e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.189044e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.525460e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.575286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143575e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.499311e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.521362 sec - 2,209,841,644 cycles # 2.939 GHz - 3,173,284,555 instructions # 1.44 insn per cycle - 0.811280771 seconds time elapsed +TOTAL : 0.523239 sec + 2,243,203,062 cycles # 2.949 GHz + 3,244,551,518 instructions # 1.45 insn per cycle + 0.818196957 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.069116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.091100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554251 sec - 4,735,824,731 cycles # 3.041 GHz - 13,470,683,397 instructions # 2.84 insn per cycle - 1.558385201 seconds time elapsed +TOTAL : 1.554535 sec + 4,725,018,841 cycles # 3.035 GHz + 13,469,753,614 instructions # 2.85 insn per cycle + 1.558693291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.965218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.046371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046371e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.855114 sec - 2,601,303,673 cycles # 3.029 GHz - 7,389,579,625 instructions # 2.84 insn per cycle - 0.859411839 seconds time elapsed +TOTAL : 0.853333 sec + 2,596,868,107 cycles # 3.030 GHz + 7,388,624,187 instructions # 2.85 insn per cycle + 0.857591565 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.304731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.304731e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.332912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554037e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.550950 sec - 1,470,989,933 cycles # 2.653 GHz - 3,058,765,662 instructions # 2.08 insn per cycle - 0.555184249 seconds time elapsed +TOTAL : 0.513899 sec + 1,466,763,063 cycles # 2.835 GHz + 3,057,876,447 instructions # 2.08 insn per cycle + 0.518107133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.774277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060098e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060098e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.777029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058907e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455765 sec - 1,309,522,407 cycles # 2.852 GHz - 2,933,428,757 instructions # 2.24 insn per cycle - 0.459981977 seconds time elapsed +TOTAL : 0.455720 sec + 1,306,910,741 cycles # 2.845 GHz + 2,932,818,419 instructions # 2.24 insn per cycle + 0.460076062 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.411920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.391166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.500870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.500870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.703353 sec - 1,366,582,014 cycles # 1.933 GHz - 1,972,774,215 instructions # 1.44 insn per cycle - 0.707707323 seconds time elapsed +TOTAL : 0.709219 sec + 1,365,455,058 cycles # 1.916 GHz + 1,971,797,344 instructions # 1.44 insn per cycle + 0.713482957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f9fb6155f7..6e69f82aee 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-03_19:12:35 +DATE: 2023-11-08_21:27:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.886874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228157e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.568514e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.812345e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.208019e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.520614e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444068 sec - 1,946,521,853 cycles # 2.951 GHz - 2,755,422,178 instructions # 1.42 insn per cycle - 0.717280231 seconds time elapsed +TOTAL : 0.446229 sec + 1,955,610,259 cycles # 2.936 GHz + 2,744,647,203 instructions # 1.40 insn per cycle + 0.725146174 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.675020e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.027076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.349457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.529337e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.985996e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.326246e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523198 sec - 2,222,274,900 cycles # 2.946 GHz - 3,198,191,753 instructions # 1.44 insn per cycle - 0.813003520 seconds time elapsed +TOTAL : 0.522218 sec + 2,238,618,942 cycles # 2.943 GHz + 3,202,939,408 instructions # 1.43 insn per cycle + 0.817604471 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069395e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091866e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.065182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.087517e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087517e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554132 sec - 4,733,166,680 cycles # 3.039 GHz - 13,456,716,984 instructions # 2.84 insn per cycle - 1.558278315 seconds time elapsed +TOTAL : 1.560291 sec + 4,729,799,308 cycles # 3.025 GHz + 13,455,876,389 instructions # 2.84 insn per cycle + 1.564515481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963106e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038064e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038064e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.946971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.020229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020229e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.856001 sec - 2,603,447,344 cycles # 3.028 GHz - 7,393,362,148 instructions # 2.84 insn per cycle - 0.860294166 seconds time elapsed +TOTAL : 0.863034 sec + 2,601,868,480 cycles # 3.003 GHz + 7,392,543,085 instructions # 2.84 insn per cycle + 0.867199240 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.354162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573385e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573385e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.323539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.538773e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538773e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.509733 sec - 1,467,381,346 cycles # 2.859 GHz - 3,058,521,485 instructions # 2.08 insn per cycle - 0.513844239 seconds time elapsed +TOTAL : 0.514660 sec + 1,469,850,553 cycles # 2.835 GHz + 3,058,079,146 instructions # 2.08 insn per cycle + 0.519050232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.783084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.767525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049329e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.454796 sec - 1,307,019,802 cycles # 2.851 GHz - 2,934,565,738 instructions # 2.25 insn per cycle - 0.459066978 seconds time elapsed +TOTAL : 0.456721 sec + 1,309,025,943 cycles # 2.843 GHz + 2,933,534,120 instructions # 2.24 insn per cycle + 0.460967936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.408065e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519741e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519741e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.405794e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.516831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.516831e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.704354 sec - 1,368,218,437 cycles # 1.933 GHz - 1,972,609,636 instructions # 1.44 insn per cycle - 0.708886358 seconds time elapsed +TOTAL : 0.704869 sec + 1,364,487,579 cycles # 1.926 GHz + 1,971,713,310 instructions # 1.45 insn per cycle + 0.709028391 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From bbeba6dec51d6ae7fe3021444ed3e3f9391736a1 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:13:34 +0100 Subject: [PATCH 06/14] [gpucpp] rerun 18 tmad tests, Olivier's patch now fixes crash #781 in ggttggg, no change in performance --- .../log_eemumu_mad_d_inl0_hrd0.txt | 136 ++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 138 ++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 136 ++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 132 ++-- .../log_ggtt_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggtt_mad_m_inl0_hrd0.txt | 134 ++-- .../log_ggttg_mad_d_inl0_hrd0.txt | 138 ++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggttg_mad_m_inl0_hrd0.txt | 134 ++-- .../log_ggttgg_mad_d_inl0_hrd0.txt | 134 ++-- .../log_ggttgg_mad_f_inl0_hrd0.txt | 138 ++--- .../log_ggttgg_mad_m_inl0_hrd0.txt | 136 ++--- .../log_ggttggg_mad_d_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_ggttggg_mad_f_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_ggttggg_mad_m_inl0_hrd0.txt | 572 +++++++++++++++++- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 ++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 134 ++-- .../log_gqttq_mad_m_inl0_hrd0.txt | 136 ++--- 18 files changed, 2685 insertions(+), 1071 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index bcf56600ba..383178f656 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,24 +16,24 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:13 +DATE: 2023-11-08_22:08:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6287s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6178s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1728s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1766s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1680s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.60E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3352s - [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s + [COUNTERS] Fortran MEs ( 1 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.30E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0722s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM 
TOTAL : 0.4131s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0701s for 90112 events => throughput is 1.29E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.217666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.227734e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242066e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s + [COUNTERS] PROGRAM TOTAL : 0.1813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1774s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3926s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 90112 events => throughput is 2.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 90112 events => throughput is 2.05E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002470e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.006601e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3800s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.603611e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.620678e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718712e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.819190e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1833s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1784s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3723s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3407s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0316s for 90112 events => throughput is 2.85E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.713996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.820321e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.775269e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.842053e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1855s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1819s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1785s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.38E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 90112 events => throughput is 2.23E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.190424e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075096e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.183626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166357e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5997s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7863s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.173877e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.141020e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.893710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.873271e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716630e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.387595e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.361218e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.739579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939860e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.929113e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.944408e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693635e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.975323e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118370e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.124184e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index ff3c2ae8d4..4b3b0b9b07 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:30 +DATE: 2023-11-08_22:08:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6418s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6338s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6276s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6195s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1778s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1697s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 
[XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4264s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3383s - [COUNTERS] Fortran MEs ( 1 ) : 0.0882s for 90112 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4139s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s + [COUNTERS] Fortran MEs ( 1 ) : 0.0849s for 90112 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1909s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 90112 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0687s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.260485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.261327e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.287607e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1786s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3464s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0278s for 90112 events => throughput is 3.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3651s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.182676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137840e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.343050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298087e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3767s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 90112 events => throughput is 3.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3609s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 90112 events => throughput is 3.47E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.496883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.442542e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.660390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.634986e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1856s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 90112 events => throughput is 3.68E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.562187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.588607e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872180e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3693s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.223682e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.372399e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.583359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.586770e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.73E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5951s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.69E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7616s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.96E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.583398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.577355e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881767e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.822297e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997979e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.937359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043514e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046785e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.954785e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.102347e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.219791e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.203659e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.299152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.365649e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.462264e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.422918e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 7741c53b46..9a947a36a5 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,8 +1,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:52:47 +DATE: 2023-11-08_22:08:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6300s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6186s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1737s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1781s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4238s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s - [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 
1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1898s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 90112 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4158s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 90112 events => throughput is 1.25E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.177056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204267e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.187091e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.208788e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0462s for 90112 events => throughput is 1.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000126e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.047978e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.127276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.116427e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1855s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 90112 events => throughput is 2.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3733s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642107e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.645393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787956e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1900s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1776s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3484s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0325s for 90112 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3708s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3395s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 90112 events => throughput is 2.88E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.740865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.821887e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.846547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.874115e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1860s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0381s for 90112 events => throughput is 2.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 90112 events => throughput is 2.39E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.237740e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.406082e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366800e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5935s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.82E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.181977e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.007927e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.926668e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918411e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.726329e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018629e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.399920e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.348012e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994146e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.877527e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.917104e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983673e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118945e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123333e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 1c30dae812..3e628018af 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,8 +17,6 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' @@ -30,10 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:04 +DATE: 2023-11-08_22:09:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3264s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3086s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2681s + [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2402s - [COUNTERS] Fortran MEs ( 1 ) : 0.4586s for 90112 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6533s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2091s + [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3139s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0379s for 8192 events => throughput is 2.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7148s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2977s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4171s for 90112 events => throughput is 2.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6716s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4071s for 90112 events => throughput is 2.21E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.224417e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212367e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3012s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2919s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2439s for 90112 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4780s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2359s for 90112 events => throughput is 3.82E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.777989e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740213e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,8 +286,8 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] PROGRAM TOTAL : 0.2955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4266s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1496s for 90112 events => throughput is 6.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.870487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.030466e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.072305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.192047e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2946s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2826s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4026s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2691s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1335s for 90112 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3659s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2362s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1297s for 90112 events => throughput is 6.95E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.610205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.841360e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.622254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.816529e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5054s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2826s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2228s for 90112 events => throughput is 4.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2517s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2106s for 90112 events => throughput is 4.28E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.955720e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.045481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094472e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7037s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7032s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6871s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 90112 events => throughput is 1.37E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7032s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6968s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.043596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.103744e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.671088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.691695e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.005777e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074802e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070229e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168601e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.147636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149757e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.014036e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.190999e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011683e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.017633e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7edcebceb9..0321a276a0 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:30 +DATE: 2023-11-08_22:09:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3245s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3489s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2811s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2670s + [COUNTERS] Fortran MEs ( 1 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - 
[COUNTERS] PROGRAM TOTAL : 1.7464s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2748s - [COUNTERS] Fortran MEs ( 1 ) : 0.4716s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6502s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2060s + [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3467s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3115s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0352s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3425s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6806s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3898s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6631s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3861s for 90112 events => throughput is 2.33E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.279168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342613e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299428e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319125e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3002s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.66E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4417s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2764s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1652s for 90112 events => throughput is 5.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2359s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1614s for 90112 events => throughput is 5.58E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.270911e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.323283e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.359921e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2834s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3699s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0894s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3197s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 90112 events => throughput is 1.07E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.026437e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028771e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3439s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2627s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 90112 events => throughput is 1.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3159s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.090791e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095999e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092579e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120004e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2963s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2859s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0104s for 8192 events => throughput is 7.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2909s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3867s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1153s for 90112 events => throughput is 7.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4173s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2979s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1195s for 90112 events => throughput is 7.54E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.366599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.668644e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548978e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.52E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6937s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6918s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 90112 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6513s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243778e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266713e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234896e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.837802e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.830084e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.769339e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762403e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.775138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776301e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872477e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.397746e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.374142e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.449606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.426544e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 30dac17633..8bacc65fe8 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,13 +17,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:53:55 +DATE: 2023-11-08_22:09:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3194s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3103s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s + [COUNTERS] Fortran MEs ( 1 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6795s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2275s - [COUNTERS] Fortran MEs ( 1 ) : 0.4521s for 90112 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6907s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s + [COUNTERS] Fortran MEs ( 1 ) : 0.4542s for 90112 events => throughput is 1.98E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- 
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 8192 events => throughput is 2.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3074s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7156s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4188s for 90112 events => throughput is 2.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6798s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2683s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4115s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.164831e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.146418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.183670e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0208s for 8192 events => throughput is 3.95E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5211s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4761s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2295s for 90112 events => throughput is 3.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.687467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799865e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.756525e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4192s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1446s for 90112 events => throughput is 6.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1464s for 90112 events => throughput is 6.15E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.051901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.181937e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.195854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.243573e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2995s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2875s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3959s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3670s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1262s for 90112 events => throughput is 7.14E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.842430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933959e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.007264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.064349e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3146s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2960s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3083s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 8192 events => throughput is 4.35E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5378s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4519s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2484s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2036s for 90112 events => throughput is 4.43E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.894022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.266660e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.117226e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7061s + [COUNTERS] PROGRAM TOTAL : 0.6949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6943s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6539s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6476s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.049753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.049281e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.613651e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.529307e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019403e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148817e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060699e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053163e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.995962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170472e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.130394e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.026315e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186789e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022885e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.035076e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d992721ecf..09e16e6057 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -2,12 +2,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
-
-make USEBUILDDIR=1 AVX=512y
make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=512y
make USEBUILDDIR=1 AVX=512z
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
@@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:54:21
+DATE: 2023-11-08_22:10:24
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5463s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2264s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3199s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5436s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2280s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3156s for 8192 events => throughput is 2.60E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5423s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3201s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5326s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2186s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3141s for 8192 events => throughput is 2.61E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9241s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4090s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5151s for 90112 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.9133s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4066s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.5067s for 90112 events => throughput is 2.57E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8783s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5509s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3274s for 8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8544s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5319s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3225s for 8192 events => throughput is 2.54E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3304s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7125s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6180s for 90112 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.3255s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.6247s for 90112 events => throughput is 2.49E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.563855e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.590377e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.539633e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.610150e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5609s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3903s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1705s for 8192 events => throughput is 4.80E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1763s for 8192 events => throughput is 4.65E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4794s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5811s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8984s for 90112 events => throughput is 4.75E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.3972s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5470s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8502s for 90112 events => throughput is 4.87E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.820475e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.010592e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.874297e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.958333e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3928s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3073s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0854s for 8192 events => throughput is 9.59E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3818s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.79E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.4294s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4857s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9437s for 90112 events => throughput is 9.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3684s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4497s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9187s for 90112 events => throughput is 9.81E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.717012e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.953639e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.756457e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002866e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0744s for 8192 events => throughput is 1.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8597s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2634s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4412s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8222s for 90112 events => throughput is 1.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117525e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126876e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1073s for 8192 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3231s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1039s for 8192 events => throughput is 7.89E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6869s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5079s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1790s for 90112 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6060s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4689s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1371s for 90112 events => throughput is 7.92E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.730653e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896705e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.578143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.740238e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6745s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6527s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8560s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.611230e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624902e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.333105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.902263e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.644038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.850642e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240451e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653799e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.868590e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.251657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248755e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.651458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.862444e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.754830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745100e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index a339973536..1a98ebc0f5 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
+
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
-
make USEBUILDDIR=1 AVX=avx2
make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:55:03
+DATE: 2023-11-08_22:11:05
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5498s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2259s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3239s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5362s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5475s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2238s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3236s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5340s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3154s for 8192 events => throughput is 2.60E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9843s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4284s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5559s for 90112 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8590s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3887s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4703s for 90112 events => throughput is 2.60E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8606s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5403s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3204s for 8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8380s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3125s for 8192 events => throughput is 2.62E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.2449s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7120s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.5329s for 90112 events => throughput is 2.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.1166s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6696s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.4470s for 90112 events => throughput is 2.61E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.612374e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.677117e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.564881e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.693750e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4339s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3327s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1012s for 8192 events => throughput is 8.10E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4030s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0934s for 8192 events => throughput is 8.77E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5445s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4937s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0508s for 90112 events => throughput is 8.58E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4739s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0282s for 90112 events => throughput is 8.76E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.676181e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.839523e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.776153e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.853955e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3149s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2698s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3025s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2591s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9260s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4419s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4841s for 90112 events => throughput is 1.86E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8724s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4009s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4715s for 90112 events => throughput is 1.91E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.919418e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922480e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2562s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.14E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4395s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4373s for 90112 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8215s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4279s for 90112 events => throughput is 2.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114883e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.107711e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0523s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2710s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0508s for 8192 events => throughput is 1.61E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0319s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4573s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5745s for 90112 events => throughput is 1.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9668s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4146s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5522s for 90112 events => throughput is 1.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.561181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619298e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625264e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6502s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.56E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8485s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.7852s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7757s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292780e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275339e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.862148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852966e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637111e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672301e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.443658e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.329588e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.661199e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.515346e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474053e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.504423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511679e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.620516e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616407e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 0d971ecde6..b41396f75b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-
make USEBUILDDIR=1 AVX=none
+
make USEBUILDDIR=1 AVX=sse4
make USEBUILDDIR=1 AVX=avx2
make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
OMP_NUM_THREADS=
-DATE: 2023-11-03_19:55:40
+DATE: 2023-11-08_22:11:42
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5559s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2317s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3242s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5361s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2201s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5470s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2235s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3235s for 8192 events => throughput is 2.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5352s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9714s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4219s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5496s for 90112 events => throughput is 2.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8603s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3884s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4719s for 90112 events => throughput is 2.60E+04 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8877s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5532s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3345s for 8192 events => throughput is 2.45E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8721s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3301s for 8192 events => throughput is 2.48E+04 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.5218s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7614s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.7604s for 90112 events => throughput is 2.40E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2894s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6845s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.6049s for 90112 events => throughput is 2.50E+04 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.427313e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.562016e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.496439e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.546299e+04 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5567s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1675s for 8192 events => throughput is 4.89E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5435s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1640s for 8192 events => throughput is 4.99E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4587s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5767s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8820s for 90112 events => throughput is 4.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.3591s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5386s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8206s for 90112 events => throughput is 4.95E+04 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.968795e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.765208e+04 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.959892e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.784106e+04 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1
[UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3947s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3085s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0862s for 8192 events => throughput is 9.51E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4043s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1
[UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.4507s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9513s for 90112 events => throughput is 9.47E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3898s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4601s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9297s for 90112 events => throughput is 9.69E+04 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685236e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002689e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.962312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001815e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2949s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4713s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8237s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2429s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4365s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8064s for 90112 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.124486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134514e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.126890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146843e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1117s for 8192 events => throughput is 7.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3234s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1077s for 8192 events => throughput is 7.60E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7230s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5103s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2127s for 90112 events => throughput is 7.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6602s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4811s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1791s for 90112 events => throughput is 7.64E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.441126e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.628154e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.419166e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.726777e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6540s + [COUNTERS] PROGRAM TOTAL : 0.6526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8526s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8298s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8190s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7961s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.93E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626262e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619555e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.404025e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.847979e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.234131e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.606969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.825056e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.246896e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.626608e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.833245e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728520e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724277e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index ba8c60f62e..e6041006eb 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_19:56:23 +DATE: 2023-11-08_22:12:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2815s - [COUNTERS] Fortran MEs ( 1 ) : 4.1753s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3823s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s + [COUNTERS] Fortran MEs ( 1 ) : 4.1043s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2787s - [COUNTERS] Fortran MEs ( 1 ) : 4.2387s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] Fortran MEs ( 1 ) : 4.0853s for 8192 events => throughput is 2.01E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0120s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9235s - [COUNTERS] Fortran MEs ( 1 ) : 46.0885s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.0624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8747s + [COUNTERS] Fortran MEs ( 1 ) : 45.1877s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.7799s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4663s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3136s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM 
TOTAL : 8.6032s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3774s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2258s for 8192 events => throughput is 1.94E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.8857s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1301s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.7557s for 90112 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.5656s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9752s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.5903s for 90112 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002618e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.950653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000666e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.8228s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5191s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3037s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6983s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4517s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2466s for 8192 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.7001s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1956s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5045s for 90112 events => throughput is 3.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.0395s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1529s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8866s for 90112 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.686347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775162e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.681541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752647e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2608s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2531s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0077s for 8192 events => throughput is 8.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2090s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2291s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9799s for 8192 events => throughput is 8.36E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8799s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8850s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9950s for 90112 events => throughput is 8.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6019s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8333s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7686s for 90112 events => throughput is 8.37E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.425637e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.622945e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.448586e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.637406e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0082s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1311s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8771s for 8192 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9647s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.4208s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7744s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6464s for 90112 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2465s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7171s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5294s for 90112 events => throughput is 9.46E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.625406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.867536e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.599473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.834174e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3764s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1005s for 8192 events => throughput is 7.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4062s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3349s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0712s for 8192 events => throughput is 7.65E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3944s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0207s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3737s for 90112 events => throughput is 7.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9424s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7703s for 90112 events => throughput is 7.66E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.671946e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.501573e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.485706e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8150s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7821s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7752s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7813s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4228s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7243s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3497s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.281506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290435e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.519229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518069e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109074e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.149081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162766e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.098811e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119359e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.169654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.170946e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.104970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114486e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.438070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433160e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 2c58d8399d..a18920ba3f 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:00:40 +DATE: 2023-11-08_22:16:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4730s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2806s - [COUNTERS] Fortran MEs ( 1 ) : 4.1924s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4492s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2726s + [COUNTERS] Fortran MEs ( 1 ) : 4.1766s for 8192 events => throughput is 1.96E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s - [COUNTERS] Fortran MEs ( 1 ) : 4.2110s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s + [COUNTERS] Fortran MEs ( 1 ) : 4.0903s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.0870s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9193s - [COUNTERS] 
Fortran MEs ( 1 ) : 46.1676s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.0727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8744s + [COUNTERS] Fortran MEs ( 1 ) : 45.1984s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.5167s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3246s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1920s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2240s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1462s for 8192 events => throughput is 1.98E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.0969s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9741s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.1228s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.9666s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8905s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.0761s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.036738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075529e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.035901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074082e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5366s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3964s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1402s for 8192 events => throughput is 7.18E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5244s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1534s for 8192 events => throughput is 7.10E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.6834s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0490s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6344s for 90112 events => throughput is 7.13E+03 events/s + 
[COUNTERS] PROGRAM TOTAL : 15.2999s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0272s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2727s for 90112 events => throughput is 7.34E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.385848e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.487987e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.336063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461964e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5013s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4990s for 8192 events => throughput is 1.64E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.9611s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4052s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5558s for 90112 events => throughput is 1.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8843s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3862s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4981s for 90112 events => throughput is 1.64E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.671775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.703770e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.674155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.715659e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1460s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4413s for 8192 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4306s for 8192 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.1917s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3395s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.8523s for 90112 events => throughput is 1.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.0325s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2899s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7425s for 90112 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.946675e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909696e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957212e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3662s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5456s for 8192 events => throughput is 1.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7944s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5278s for 8192 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.4389s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4516s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.9874s for 90112 events => throughput is 1.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.1973s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8043s for 90112 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.534307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558982e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.484518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568288e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7549s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7491s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6207s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3864s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2342s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5805s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.582914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598757e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.939400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937809e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.495923e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.662803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725491e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.489429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.498449e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.660457e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.463590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.473649e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522099e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7032d72896..05db57554d 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,16 +16,16 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:04:02 +DATE: 2023-11-08_22:19:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] Fortran MEs ( 1 ) : 4.1852s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3676s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s + [COUNTERS] Fortran MEs ( 1 ) : 4.0901s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2777s - [COUNTERS] Fortran MEs ( 1 ) : 4.1649s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s + [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.3675s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9183s - 
[COUNTERS] Fortran MEs ( 1 ) : 46.4493s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1152s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8703s + [COUNTERS] Fortran MEs ( 1 ) : 45.2450s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.0356s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6432s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3924s for 8192 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7049s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4327s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2722s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.3841s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2075s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.1766s for 90112 events => throughput is 1.87E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.0960s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0891s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0069s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.971437e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924168e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965809e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7893s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5036s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2857s for 8192 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7042s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4800s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2242s for 8192 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.4680s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1631s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.3048s for 90112 events => throughput is 3.56E+03 events/s 
+ [COUNTERS] PROGRAM TOTAL : 28.5105s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0554s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4551s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.703810e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800834e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.713606e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788503e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9882s for 8192 events => throughput is 8.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1858s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2226s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9633s for 8192 events => throughput is 8.50E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.9261s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8936s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.0324s for 90112 events => throughput is 8.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.5514s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8252s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7262s for 90112 events => throughput is 8.40E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.503062e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.756273e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.519397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.759413e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1323s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8805s for 8192 events => throughput is 9.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9510s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8530s for 8192 events => throughput is 9.60E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.3871s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7750s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6121s for 90112 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1748s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7107s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4641s for 90112 events => throughput is 9.52E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.683983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.859146e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.679001e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.890303e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5013s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1134s for 8192 events => throughput is 7.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4412s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3447s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0965s for 8192 events => throughput is 7.47E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.3721s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0357s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3363s for 90112 events => throughput is 7.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9437s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8266s for 90112 events => throughput is 7.62E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.423059e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.668015e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.425324e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694387e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8048s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7728s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7756s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3626s for 90112 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7246s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3499s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.294705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280245e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.524485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.525176e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.113307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116522e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157499e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119956e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.183136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172287e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.103258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.122850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.440669e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 568f545851..b972c40fa5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:47 +DATE: 2023-11-08_22:25:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.8408s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s + [COUNTERS] Fortran MEs ( 1 ) : 95.3863s for 8192 events => throughput is 8.59E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s + [COUNTERS] Fortran MEs ( 1 ) : 95.0528s for 8192 events => throughput is 8.62E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1050.5151s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1583s + [COUNTERS] Fortran MEs ( 1 ) : 1046.3568s for 90112 events => throughput is 8.61E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 216.0448s + [COUNTERS] Fortran Overhead ( 0 ) : 99.5423s + [COUNTERS] CudaCpp MEs ( 2 ) : 116.5025s for 8192 events => throughput is 7.03E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1395.0826s + [COUNTERS] Fortran Overhead ( 0 ) : 101.4573s + [COUNTERS] CudaCpp MEs ( 2 ) : 1293.6254s for 90112 events => throughput is 6.97E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007) differ by less than 2E-14 (1.1102230246251565e-15) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.294341e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.275454e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
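
Note on the 'OK! xsec ... differ by less than 2E-14' lines above: the value in the final parentheses is the relative difference between the Fortran and CudaCpp cross sections, which must stay below the 2E-14 tolerance used for these double-precision ('_d_') builds. A minimal Python sketch of that arithmetic (xsec_rel_diff is a hypothetical helper for illustration; the actual check lives in the tmad test scripts):

# Sketch of the xsec cross-check arithmetic (illustration only).
def xsec_rel_diff(xsec_fortran, xsec_cpp):
    # Relative difference |cpp/fortran - 1| between the two cross sections.
    return abs(xsec_cpp / xsec_fortran - 1.0)

# Values copied from the log above: the double-precision builds agree to a
# few 1E-15, i.e. close to machine epsilon and well below the 2E-14 tolerance.
assert xsec_rel_diff(1.1693100945435802E-006, 1.1693100945435831E-006) < 2E-14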
+-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 107.3938s + [COUNTERS] Fortran Overhead ( 0 ) : 49.4703s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.9235s for 8192 events => throughput is 1.41E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (2.220446049250313e-15) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 689.6088s + [COUNTERS] Fortran Overhead ( 0 ) : 53.6676s + [COUNTERS] CudaCpp MEs ( 2 ) : 635.9412s for 90112 events => throughput is 1.42E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.663387e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 50.5726s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0971s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.4754s for 8192 events => throughput is 2.98E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 326.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 26.6301s + [COUNTERS] CudaCpp MEs ( 2 ) : 300.3396s for 90112 events => throughput is 3.00E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.612820e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.630261e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 44.4764s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3120s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.1644s for 8192 events => throughput is 3.39E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 289.1902s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9124s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.2778s for 90112 events => throughput is 3.40E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.088132e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.127446e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.6965s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1825s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5139s for 8192 events => throughput is 3.48E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 283.5251s + [COUNTERS] Fortran Overhead ( 0 ) : 25.9112s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.6139s for 90112 events => throughput is 3.50E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.741805e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.777930e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 4.1875s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1069s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0806s for 8192 events => throughput is 7.58E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435838E-006) differ by less than 2E-14 (3.1086244689504383e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 18.7118s + [COUNTERS] Fortran Overhead ( 0 ) : 6.8168s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8950s for 90112 events => throughput is 7.58E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.523661e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.283120e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.266218e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.591927e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.251570e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.476794e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.262349e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.252080e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e844ee5b79..3ca211fa85 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:50 +DATE: 2023-11-08_23:51:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.6648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s + [COUNTERS] Fortran MEs ( 1 ) : 95.2103s for 8192 events => throughput is 8.60E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
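
The throughput quoted on the [COUNTERS] lines above is simply the number of events divided by the time spent in the matrix-element calls; a one-liner to reproduce it (an illustration of the arithmetic, assuming the counters derive it exactly this way):

# Throughput arithmetic from the '[COUNTERS] Fortran MEs' line above.
nevents, me_seconds = 8192, 95.2103   # 8192 events in 95.2103s of Fortran MEs
throughput = nevents / me_seconds     # ~8.60E+01 events/s, as printed
print(f"throughput is {throughput:.2E} events/s")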
+-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.3879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4521s + [COUNTERS] Fortran MEs ( 1 ) : 94.9358s for 8192 events => throughput is 8.63E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1051.3512s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1998s + [COUNTERS] Fortran MEs ( 1 ) : 1047.1514s for 90112 events => throughput is 8.61E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 195.2840s + [COUNTERS] Fortran Overhead ( 0 ) : 89.6572s + [COUNTERS] CudaCpp MEs ( 2 ) : 105.6269s for 8192 events => throughput is 7.76E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694768344939596E-006) differ by less than 4E-4 (0.00014259686216466783) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1253.2021s + [COUNTERS] Fortran Overhead ( 0 ) : 93.4786s + [COUNTERS] CudaCpp MEs ( 2 ) : 1159.7235s for 90112 events => throughput is 7.77E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436150871156E-007) differ by less than 4E-4 (0.00014045934987350073) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.188520e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.207566e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
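
For the single-precision ('_f_') builds in this log the same relative-difference check is applied with a much looser 4E-4 tolerance, since float MEs only agree with the double-precision Fortran reference at the 1E-4 level (all the '_f_' comparisons here come out around 1.4E-4). Reusing the xsec_rel_diff sketch from above:

# Same relative-difference check, with the looser single-precision tolerance.
# Values copied from the '_f_' log above: the difference is ~1.4E-4 < 4E-4.
assert xsec_rel_diff(1.1693100945435802E-006, 1.1694768344939596E-006) < 4E-4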
+-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.9590s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2330s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7260s for 8192 events => throughput is 3.18E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694765850750953E-006) differ by less than 4E-4 (0.00014238355787066226) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 312.4727s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8498s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.6229s for 90112 events => throughput is 3.15E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430669586527E-007) differ by less than 4E-4 (0.00014020271663550687) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.595667e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615224e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 25.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8022s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.6023s for 8192 events => throughput is 6.02E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 164.6743s + [COUNTERS] Fortran Overhead ( 0 ) : 15.5764s + [COUNTERS] CudaCpp MEs ( 2 ) : 149.0979s for 90112 events => throughput is 6.04E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.233727e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.144603e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.4388s + [COUNTERS] Fortran Overhead ( 0 ) : 10.5095s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9293s for 8192 events => throughput is 6.87E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
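
In the CHECK/GCHECK lines of these logs, the '-p <blocks> <threads> <iterations>' arguments fix the GPU/SIMD grid, and the number of events per iteration is blocks*threads (an assumption read off the CHECK(8192) and GCHECK(MAX) labels, e.g. 256*32 = 8192). A quick Python check:

# Events per iteration for the '-p blocks threads iterations' grids above:
# CHECK(8192) uses 256*32; the three GCHECK(MAX*) variants all give 16384.
for blocks, threads in [(256, 32), (512, 32), (128, 128), (2048, 8)]:
    print(f"-p {blocks} {threads} 1 -> {blocks * threads} events")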
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 145.9227s + [COUNTERS] Fortran Overhead ( 0 ) : 13.9719s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.9508s for 90112 events => throughput is 6.83E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.277686e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.316223e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 22.8899s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3435s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5464s for 8192 events => throughput is 7.09E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694767957195604E-006) differ by less than 4E-4 (0.00014256370209930758) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 142.7065s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9424s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.7641s for 90112 events => throughput is 7.05E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435956349820E-007) differ by less than 4E-4 (0.00014045024240250115) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.537880e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.497574e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 2.4801s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9879s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694770708195000E-006) differ by less than 4E-4 (0.00014279896898083955) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 11.0377s + [COUNTERS] Fortran Overhead ( 0 ) : 5.5836s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4541s for 90112 events => throughput is 1.65E+04 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007) differ by less than 4E-4 (0.0001408023850304474) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.639292e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.626171e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.329585e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.369301e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.304460e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.376586e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333260e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.421151e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 43bf5072f2..2729351c42 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:53 +DATE: 2023-11-09_00:57:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -51,14 +51,552 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' -ERROR! ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' failed -d R # 5 > -0.0 -0.0 -0.0 0.4 0.4 -d R # 6 > -0.0 -0.0 -0.0 -0.0 0.4 -s min # 3> 0.0119716.0 29929.0 29929.0 0.0 -s min # 4> 0.0 0.0 29929.0 29929.0 0.0 -s min # 5> 0.0 0.0 0.0 0.0 0.0 -s min # 6> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 3> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 4> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 5> 0.0 0.0 0.0 0.0 0.0 -xqcutij # 6> 0.0 0.0 0.0 0.0 0.0 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 1 events (found 166 events) + [COUNTERS] PROGRAM TOTAL : 95.3917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4564s + [COUNTERS] Fortran MEs ( 1 ) : 94.9352s for 8192 events => throughput is 8.63E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 95.2404s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4495s + [COUNTERS] Fortran MEs ( 1 ) : 94.7909s for 8192 events => throughput is 8.64E+01 events/s + +*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1049.6483s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1482s + [COUNTERS] Fortran MEs ( 1 ) : 1045.5001s for 90112 events => throughput is 8.62E+01 events/s + +*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
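The throughput figures in the [COUNTERS] lines are simple events-per-second arithmetic. A one-line sketch for the Fortran x10 run just shown:

# Throughput as reported by the [COUNTERS] lines (illustrative arithmetic)
n_events   = 90112      # events processed in the x10 run, per the log above
me_seconds = 1045.5001  # 'Fortran MEs ( 1 )' time from the same log
print(n_events / me_seconds)  # ~86.2, i.e. the reported 8.62E+01 events/s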
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 223.2377s + [COUNTERS] Fortran Overhead ( 0 ) : 102.8564s + [COUNTERS] CudaCpp MEs ( 2 ) : 120.3813s for 8192 events => throughput is 6.81E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101016896846E-006) differ by less than 2E-4 (6.111385175699979e-09) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 1425.5713s + [COUNTERS] Fortran Overhead ( 0 ) : 106.5194s + [COUNTERS] CudaCpp MEs ( 2 ) : 1319.0519s for 90112 events => throughput is 6.83E+01 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.033155e+01 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.028364e+01 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 110.1179s + [COUNTERS] Fortran Overhead ( 0 ) : 50.7873s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.3305s for 8192 events => throughput is 1.38E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658807442115e-09) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 704.2691s + [COUNTERS] Fortran Overhead ( 0 ) : 54.2949s + [COUNTERS] CudaCpp MEs ( 2 ) : 649.9742s for 90112 events => throughput is 1.39E+02 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007) differ by less than 2E-4 (5.866422903011426e-09) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.635297e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628042e+02 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.2204s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9374s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2831s for 8192 events => throughput is 3.12E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
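Comparing the none/sse4/avx2/512y/512z throughputs in these logs by eye is tedious; a small hypothetical helper (the function name and log path are illustrative, not part of the repository) could scrape the two kinds of throughput lines like this:

import re

# Hypothetical scraper for the '=> throughput is X events/s' [COUNTERS] lines
# and the 'EvtsPerSec[MECalcOnly] (3a) = ( X ) sec^-1' CHECK/GCHECK lines.
pat_counters = re.compile(r'throughput is ([0-9.]+E[+-][0-9]+) events/s')
pat_check    = re.compile(r'EvtsPerSec\[MECalcOnly\] \(3a\) = \( (\S+) \) sec\^-1')

def throughputs(path):
    text = open(path).read()
    return ([float(x) for x in pat_counters.findall(text)],
            [float(x) for x in pat_check.findall(text)])

counters, checks = throughputs('log_ggttggg_mad_m_inl0_hrd0.txt')
print(counters, checks)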
+-------------------- +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 314.3646s + [COUNTERS] Fortran Overhead ( 0 ) : 26.1162s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.2484s for 90112 events => throughput is 3.13E+02 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.810528e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.825565e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 42.6054s + [COUNTERS] Fortran Overhead ( 0 ) : 19.4149s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1905s for 8192 events => throughput is 3.53E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 278.0352s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0285s + [COUNTERS] CudaCpp MEs ( 2 ) : 255.0067s for 90112 events => throughput is 3.53E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.372569e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.390556e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 44.8365s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9299s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9066s for 8192 events => throughput is 3.58E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 280.1799s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4637s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.7162s for 90112 events => throughput is 3.54E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.829822e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.840554e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
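In the CHECK/GCHECK command lines that follow, the three numbers after -p appear to be blocks, threads per block and iterations, as the GCHECK(8192) label matching 256 x 32 suggests; blocks times threads gives the events per iteration:

# Grid sizes implied by the -p arguments in this ggttggg log (plain arithmetic)
print(256 * 32)   # 8192  -> CHECK/GCHECK(8192)
print(512 * 32)   # 16384 -> GCHECK(MAX)
print(128 * 128)  # 16384 -> GCHECK(MAX128THR)
print(2048 * 8)   # 16384 -> GCHECK(MAX8THR)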
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 3.5385s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6761s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8624s for 8192 events => throughput is 9.50E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.2792201459509442e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 15.7972s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4751s for 90112 events => throughput is 9.51E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173705990875078e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.416746e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.082101e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111361e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.159067e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.107992e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.110248e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.116277e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.631653e+03 ) sec^-1 + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 2a2ae334de..a53e3fae12 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'make[1]: Nothing to be done for 'all'. + make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:20 +DATE: 2023-11-08_22:24:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2380s - [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3033s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s + [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3042s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2333s - [COUNTERS] Fortran MEs ( 1 ) : 0.0708s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2323s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2114s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4363s - [COUNTERS] Fortran MEs ( 1 ) : 0.7751s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1700s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4093s + [COUNTERS] Fortran MEs ( 1 ) : 0.7607s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3158s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3087s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 
+167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8420s for 90112 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3194s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8220s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.094809e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.102064e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4571s for 90112 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9124s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4682s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4441s for 90112 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.984398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.028339e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.942353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046734e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2839s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2792s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.51E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7384s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4779s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2605s for 90112 events => throughput is 3.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6989s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4448s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2540s for 90112 events => throughput is 3.55E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.360936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552356e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.508331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523608e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7099s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2336s for 90112 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6795s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2296s for 90112 events => throughput is 3.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.842884e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.986906e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3488s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8124s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3444s for 90112 events => throughput is 2.62E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.489296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.637628e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.512748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.616200e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6555s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.13E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8543s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.578046e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567103e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.918680e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093360e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.385541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536245e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.515910e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.495821e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366310e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.517486e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.781318e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749421e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383694e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528020e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.778819e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.773747e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 76ba714558..8d2e1984e4 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:08:49 +DATE: 2023-11-08_22:24:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3082s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2374s - [COUNTERS] Fortran MEs ( 1 ) : 0.0709s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3107s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2376s - [COUNTERS] Fortran MEs ( 1 ) : 0.0713s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2276s + [COUNTERS] Fortran MEs ( 1 ) : 0.0689s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4416s - [COUNTERS] Fortran MEs ( 1 ) : 0.7760s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4006s + [COUNTERS] Fortran MEs ( 1 ) : 0.7577s for 
90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0730s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3282s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7945s for 90112 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7649s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157942e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172513e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2630s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 8192 events => throughput is 3.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7627s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2851s for 90112 events => throughput is 3.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ 
-258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.058447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237957e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.117460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.272249e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2617s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6264s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1403s for 90112 events => throughput is 6.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5730s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.530818e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.304004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.313362e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2672s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2557s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6213s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1315s for 90112 events => throughput is 6.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5576s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.360354e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.852775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.523552e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2707s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6597s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4813s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1784s for 90112 events => throughput is 5.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6820s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.682841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733153e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.814031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.992885e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6663s + [COUNTERS] PROGRAM TOTAL : 0.6547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6542s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9031s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 90112 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810157e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584146e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.442986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.491850e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.776377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.856033e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.714442e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.715106e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.784654e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.884678e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.791545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799322e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.353442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.441795e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.984091e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.896004e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d9f19e3972..19ad35f402 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-03_20:09:17 +DATE: 2023-11-08_22:25:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2365s - [COUNTERS] Fortran MEs ( 1 ) : 0.0711s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3005s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s - [COUNTERS] Fortran MEs ( 1 ) : 0.0707s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3006s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2306s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4390s - [COUNTERS] Fortran MEs ( 1 ) : 0.7783s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1678s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4091s + [COUNTERS] Fortran MEs ( 1 ) : 0.7587s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 0.3817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4080s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8525s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8245s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076052e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.029864e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093800e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5041s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4488s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8975s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4298s for 90112 events => throughput is 2.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.013366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015345e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.020889e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.988560e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2872s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2819s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7508s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4909s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2599s for 90112 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7744s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2658s for 90112 events => throughput is 3.39E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.485253e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.471740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534003e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2862s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7107s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4801s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2306s for 90112 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6779s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2245s for 90112 events => throughput is 4.01E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.890792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.974625e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.973788e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.057698e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0339s for 8192 events => throughput is 2.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8573s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3600s for 90112 events => throughput is 2.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8085s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3513s for 90112 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.438047e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.330681e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533534e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.16E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8718s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8641s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.584492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553454e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.972938e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.988956e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.377134e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533250e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.496287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.514727e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.388325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.523754e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.763560e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800142e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.530148e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.773123e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776434e+07 ) sec^-1 TEST COMPLETED From 7ae4e0460964de0b5e1951c9e8338d75f4ede8c3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:21:17 +0100 Subject: [PATCH 07/14] [actions/gpucpp] reenable testsuite check in the CI This completed my first version of the gpucpp PR, but I later also included the 3.5.2 upgrade Revert "[actions/gpucpp] TEMPORARILY disable testsuite on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded')" This reverts commit 1fd1c4c5f493c21c3b271f980571db21c604bc7c.
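[Editor's note: a minimal sketch, not part of this patch set, of one possible mitigation for the rate-limit error mentioned above, assuming the failing step invoked the gh CLI directly. 'HTTP 403: API rate limit exceeded' typically indicates unauthenticated GitHub API calls, and gh honours a GH_TOKEN environment variable, so passing the workflow token should raise the limit. The step name and placement below are illustrative only.]

    # Hypothetical workflow step (not from this patch): authenticate gh via
    # GH_TOKEN so the extension install does not hit the unauthenticated
    # API rate limit.
    - name: Install gh-actions-cache extension
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: gh extension install actions/gh-actions-cache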
--- .github/workflows/testsuite_allprocesses.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 662284f944..7eaad09c9f 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -15,9 +15,8 @@ on: workflow_dispatch: # Trigger the all-processes workflow for pull requests to master - # TEMPORARILY disable these tests on PRs (gh extension install actions/gh-actions-cache gives 'HTTP 403: API rate limit exceeded') - ###pull_request: - ### branches: [ master ] + pull_request: + branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed push: From 3ee024de3f3181d3ceae1d6231c35ad08c2a4f24 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 16:56:55 +0100 Subject: [PATCH 08/14] [gpucpp] include Olivier's latest mg5amcnlo changes (merged from 3.5.2) --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index d7a466dd54..d8c1613ccf 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit d7a466dd54bb2f57564f5cc674f129ebf095c969 +Subproject commit d8c1613ccf638b5b078a64379e385def5649622c From 05dd4b2f25f960d3c60f83801b5d1b3c65fce95f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 10:51:05 +0100 Subject: [PATCH 09/14] [gpucpp] in CODEGEN launch_plugin.py and output.py, improve python formatting (cosmetics only) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 4 ++-- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e3f88719f2..fb33465a03 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -203,9 +203,9 @@ def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]): # AV (default from OM's tutorial) - add a debug printout def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): """Typically creating jpeg/HTML output/ compilation/... - cmdhistory is the list of command used so far. - MG5options are all the options of the main interface - outputflags is a list of options provided when doing the output command""" + cmdhistory is the list of command used so far. 
+ MG5options are all the options of the main interface + outputflags is a list of options provided when doing the output command""" misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) if self.in_madevent_mode: self.add_input_for_banner() @@ -217,7 +217,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): #if os.system(path + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0: # logger.debug("####### \n stdout is \n %s", stdout) # logger.info("####### \n stderr is \n %s", stderr) - # raise Exception('ERROR! the O/S call to patchMad.sh failed') + # raise Exception('ERROR! the O/S call to patchMad.sh failed') # OLD implementation (SH PR #762) #if os.system(PLUGINDIR + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0: # logger.debug("####### \n stdout is \n %s", stdout) @@ -270,7 +270,7 @@ def add_madevent_plugin_fct(self): which contains a series of functions and one dictionary variable TO_OVERWRITE that will be used to have temporary overwrite of all the key variable passed as string by their value. all variable that are file related should be called as madgraph.dir.file.variable - """ + """ plugin_path = os.path.dirname(os.path.realpath( __file__ )) files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) @@ -286,10 +286,10 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16') return args - + #------------------------------------------------------------------------------------ -class GPU_ProcessExporter(PLUGIN_ProcessExporter): +class GPU_ProcessExporter(PLUGIN_ProcessExporter): def change_output_args(args, cmd): """ """ cmd._export_format = "madevent" @@ -298,7 +298,7 @@ def change_output_args(args, cmd): if 'vector_size' not in ''.join(args): args.append('--vector_size=16384') return args - + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): misc.sprint("enter dedicated function") out = super().finalize(matrix_element, cmdhistory, MG5options, outputflag) From 641754e2d3a5f2a4386d7740d44420dad6c2d6f8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:14:51 +0100 Subject: [PATCH 10/14] [gpucpp] in CODEGEN __init__.py, mark version 3.5.2 as validated minimum version --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py index a0cd9dbfb3..82661c6c66 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py @@ -55,6 +55,6 @@ __author__ = 'Andrea Valassi' __email__ = 'andrea.valassi@cern.ch' __version__ = (1,0,0) - minimal_mg5amcnlo_version = (3,5,1) + minimal_mg5amcnlo_version = (3,5,2) maximal_mg5amcnlo_version = (1000,1000,1000) - latest_validated_version = (3,5,1) + latest_validated_version = (3,5,2) From aae8ef15559e794283328c341b33e4703afc7642 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:16:38 +0100 Subject: [PATCH 11/14] [gpucpp] in CODEGEN output.py, remove run_card_class again (Olivier has made this unnecessary in 3.5.2) Revert "[gpucpp] in CODEGEN output.py, add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card"" This reverts commit 
8c654cf0d35c332e3f4449301f8a8758cc3efce5. --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index fb33465a03..5b557e832a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,9 +149,6 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! - # AV 08 Nov 2023 add run_card_class to avoid crashes after Olivier's commit 8a18cc242 "better handling of the run_card" - run_card_class = None - # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 From 56308e937172b6d66f0bbcf1f9ecb581794cebeb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 17:22:56 +0100 Subject: [PATCH 12/14] [gpucpp] regenerate all 15 processes after Olivier's latest upstream changes, merging to 3.5.2 Most changes are in the version comments (from 3.5.1 to 3.5.2). There are also some minor changes in genps.f, but they look like bug fixes (nincoming instead of a hardcoded 2). --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 31 ++++---- .../ee_mumu.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt | 2 +- .../ee_mumu.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 2 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig.f | 2 +- .../SubProcesses/P1_epem_mupmum/auto_dsig1.f | 4 +- .../SubProcesses/P1_epem_mupmum/matrix1.f | 4 +- .../cudacpp/ee_mumu.mad/SubProcesses/genps.f | 4 +- .../ee_mumu.mad/bin/internal/__init__.py | 1 + .../ee_mumu.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../ee_mumu.mad/bin/internal/gen_ximprove.py | 18 +++-- .../ee_mumu.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/ee_mumu.mad/bin/internal/misc.py | 2 +- .../ee_mumu.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 30 ++++---- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 2 +- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 2 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 2 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 35 ++++----- .../cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt.mad/MGMEVersion.txt | 2 +- .../gg_tt.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttx/matrix1.f | 4 +- epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f | 4 +- .../gg_tt.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_tt.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_tt.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_tt.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++---
epochX/cudacpp/gg_tt.mad/bin/internal/misc.py | 2 +- .../gg_tt.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 2 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 28 +++---- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 2 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 2 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 39 +++++----- .../gg_tt01g.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt | 2 +- .../gg_tt01g.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttx/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttx/matrix1.f | 4 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 2 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 2 +- .../SubProcesses/P2_gg_ttxg/auto_dsig.f | 2 +- .../SubProcesses/P2_gg_ttxg/auto_dsig1.f | 4 +- .../SubProcesses/P2_gg_ttxg/matrix1.f | 4 +- .../cudacpp/gg_tt01g.mad/SubProcesses/genps.f | 4 +- .../gg_tt01g.mad/bin/internal/__init__.py | 1 + .../gg_tt01g.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_tt01g.mad/bin/internal/gen_ximprove.py | 18 +++-- .../bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_tt01g.mad/bin/internal/misc.py | 2 +- .../gg_tt01g.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 37 +++++----- .../gg_ttg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt | 2 +- .../gg_ttg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxg/matrix1.f | 4 +- .../cudacpp/gg_ttg.mad/SubProcesses/genps.f | 4 +- .../gg_ttg.mad/bin/internal/__init__.py | 1 + .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_ttg.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttg.mad/bin/internal/misc.py | 2 +- .../gg_ttg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 28 +++---- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 2 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 37 +++++----- .../gg_ttgg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt | 2 +- .../gg_ttgg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig.f | 
2 +- .../SubProcesses/P1_gg_ttxgg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxgg/matrix1.f | 4 +- .../cudacpp/gg_ttgg.mad/SubProcesses/genps.f | 4 +- .../gg_ttgg.mad/bin/internal/__init__.py | 1 + .../gg_ttgg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttgg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gg_ttgg.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttgg.mad/bin/internal/misc.py | 2 +- .../gg_ttgg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 32 ++++---- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 2 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 37 +++++----- .../gg_ttggg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt | 2 +- .../gg_ttggg.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 2 +- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig.f | 2 +- .../SubProcesses/P1_gg_ttxggg/auto_dsig1.f | 4 +- .../SubProcesses/P1_gg_ttxggg/matrix1.f | 4 +- .../cudacpp/gg_ttggg.mad/SubProcesses/genps.f | 4 +- .../gg_ttggg.mad/bin/internal/__init__.py | 1 + .../gg_ttggg.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gg_ttggg.mad/bin/internal/gen_ximprove.py | 18 +++-- .../bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gg_ttggg.mad/bin/internal/misc.py | 2 +- .../gg_ttggg.mad/bin/internal/shower_card.py | 10 ++- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 ++++---- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 2 +- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 2 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 2 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 2 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 35 ++++----- .../gq_ttq.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt | 2 +- .../gq_ttq.mad/SubProcesses/MGVersion.txt | 2 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 2 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 2 +- .../SubProcesses/P1_gu_ttxu/auto_dsig.f | 2 +- .../SubProcesses/P1_gu_ttxu/auto_dsig1.f | 4 +- .../SubProcesses/P1_gu_ttxu/matrix1.f | 4 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 2 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 2 +- .../SubProcesses/P1_gux_ttxux/auto_dsig.f | 2 +- .../SubProcesses/P1_gux_ttxux/auto_dsig1.f | 4 +- .../SubProcesses/P1_gux_ttxux/matrix1.f | 4 +- .../cudacpp/gq_ttq.mad/SubProcesses/genps.f | 4 +- .../gq_ttq.mad/bin/internal/__init__.py | 1 + .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 5 +- .../bin/internal/common_run_interface.py | 17 ++++- .../gq_ttq.mad/bin/internal/gen_ximprove.py | 18 +++-- .../gq_ttq.mad/bin/internal/launch_plugin.py | 4 +- .../bin/internal/madevent_interface.py | 25 ++++--- .../cudacpp/gq_ttq.mad/bin/internal/misc.py | 2 +- .../gq_ttq.mad/bin/internal/shower_card.py | 10 ++- 
 epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 2 +-
 .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 2 +-
 .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 40 +++++-----
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 2 +-
 .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 2 +-
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 2 +-
 .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 2 +-
 epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 2 +-
 .../CODEGEN_cudacpp_heft_gg_h_log.txt | 22 +++---
 .../P1_Sigma_heft_gg_h/CPPProcess.cc | 2 +-
 .../P1_Sigma_heft_gg_h/CPPProcess.h | 2 +-
 .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 2 +-
 .../heft_gg_h.sa/src/Parameters_heft.cc | 2 +-
 .../heft_gg_h.sa/src/Parameters_heft.h | 2 +-
 .../CODEGEN_mad_pp_tt012j_log.txt | 73 ++++++++++---------
 .../pp_tt012j.mad/Cards/proc_card_mg5.dat | 2 +-
 epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt | 2 +-
 .../pp_tt012j.mad/SubProcesses/MGVersion.txt | 2 +-
 .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 2 +-
 .../SubProcesses/P0_gg_ttx/CPPProcess.h | 2 +-
 .../SubProcesses/P0_gg_ttx/auto_dsig.f | 2 +-
 .../SubProcesses/P0_gg_ttx/auto_dsig1.f | 4 +-
 .../SubProcesses/P0_gg_ttx/matrix1.f | 4 +-
 .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 2 +-
 .../SubProcesses/P0_uux_ttx/CPPProcess.h | 2 +-
 .../SubProcesses/P0_uux_ttx/auto_dsig.f | 2 +-
 .../SubProcesses/P0_uux_ttx/auto_dsig1.f | 4 +-
 .../SubProcesses/P0_uux_ttx/matrix1.f | 4 +-
 .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gg_ttxg/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gg_ttxg/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gg_ttxg/matrix1.f | 4 +-
 .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gu_ttxu/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gu_ttxu/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gu_ttxu/matrix1.f | 4 +-
 .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 2 +-
 .../SubProcesses/P1_gux_ttxux/auto_dsig.f | 2 +-
 .../SubProcesses/P1_gux_ttxux/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_gux_ttxux/matrix1.f | 4 +-
 .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 2 +-
 .../SubProcesses/P1_uux_ttxg/CPPProcess.h | 2 +-
 .../SubProcesses/P1_uux_ttxg/auto_dsig.f | 2 +-
 .../SubProcesses/P1_uux_ttxg/auto_dsig1.f | 4 +-
 .../SubProcesses/P1_uux_ttxg/matrix1.f | 4 +-
 .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gg_ttxgg/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gg_ttxgg/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gg_ttxgg/matrix1.f | 4 +-
 .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gg_ttxuux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gg_ttxuux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gg_ttxuux/matrix1.f | 4 +-
 .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gu_ttxgu/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gu_ttxgu/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gu_ttxgu/matrix1.f | 4 +-
 .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_gux_ttxgux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_gux_ttxgux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_gux_ttxgux/matrix1.f | 4 +-
 .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uc_ttxuc/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uc_ttxuc/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uc_ttxuc/matrix1.f | 4 +-
 .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_ucx_ttxucx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_ucx_ttxucx/matrix1.f | 4 +-
 .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uu_ttxuu/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uu_ttxuu/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uu_ttxuu/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxccx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxccx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxccx/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxgg/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxgg/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxgg/matrix1.f | 4 +-
 .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uux_ttxuux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uux_ttxuux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uux_ttxuux/matrix1.f | 4 +-
 .../P2_uxcx_ttxuxcx/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uxcx_ttxuxcx/matrix1.f | 4 +-
 .../P2_uxux_ttxuxux/CPPProcess.cc | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/auto_dsig.f | 2 +-
 .../SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f | 4 +-
 .../SubProcesses/P2_uxux_ttxuxux/matrix1.f | 4 +-
 .../pp_tt012j.mad/SubProcesses/genps.f | 4 +-
 .../pp_tt012j.mad/bin/internal/__init__.py | 1 +
 .../pp_tt012j.mad/bin/internal/banner.py | 5 +-
 .../bin/internal/common_run_interface.py | 17 ++++-
 .../bin/internal/gen_ximprove.py | 18 +++--
 .../bin/internal/launch_plugin.py | 4 +-
 .../bin/internal/madevent_interface.py | 25 ++++---
 .../pp_tt012j.mad/bin/internal/misc.py | 2 +-
 .../pp_tt012j.mad/bin/internal/shower_card.py | 10 ++-
 epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h | 2 +-
 .../pp_tt012j.mad/src/Parameters_sm.cc | 2 +-
 .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 2 +-
 307 files changed, 1017 insertions(+), 753 deletions(-)
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index d5d0a77b77..e6546f684c 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005647420883178711
+DEBUG: model prefixing takes 0.005372047424316406
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -161,10 +161,10 @@ Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
 INFO: initialize a new directory: CODEGEN_mad_ee_mumu
 INFO: remove old information in CODEGEN_mad_ee_mumu
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards
@@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1
 INFO: Creating files in directory P1_epem_mupmum
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.100 s
+Wrote files for 8 helas calls in 0.098 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
-ALOHA: aloha creates 3 routines in 0.203 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+ALOHA: aloha creates 3 routines in 0.200 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 7 routines in 0.267 s
+ALOHA: aloha creates 7 routines in 0.255 s
 FFV1
 FFV1
 FFV2
@@ -226,12 +226,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
@@ -241,16 +242,16 @@ patching file matrix1.f
 Hunk #3 succeeded at 230 (offset 9 lines).
 Hunk #4 succeeded at 267 (offset 18 lines).
 Hunk #5 succeeded at 312 (offset 18 lines).
-DEBUG: p.returncode =  0 [output.py at line 235]
+DEBUG: p.returncode =  0 [output.py at line 232]
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README
 Run "open index.html" to see more information about this process.
 quit
 
-real 0m2.189s
+real 0m4.853s
 user 0m1.653s
-sys 0m0.232s
+sys 0m0.201s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -263,7 +264,7 @@ sys 0m0.232s
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -297,7 +298,7 @@ launch in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
index b9e01f684b..618adbca06 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 0af629d3a8..fc293da1de 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
index f2ef5c1e14..77b610753c 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
index f78f7c102e..02520466e6 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
 DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
index fcf2e4dec5..4188745070 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
@@ -1,7 +1,7 @@
 DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
index 21e300b33e..1991a72bb9 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -319,7 +319,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 
 REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-           if (d1.gt.0.and.d1.le.2) then
+           if (d1.gt.0.and.d1.le.nincoming) then
              ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
            else
              ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
            endif
-           if (d2.gt.0.and.d2.le.2) then
+           if (d2.gt.0.and.d2.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
            else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
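
The genps.f hunk above deserves a note: it replaces a hardcoded `2` by `nincoming` when deciding whether a daughter leg is initial-state, i.e. whether its momentum must be subtracted rather than added when building a propagator momentum. A minimal Python sketch of that sign rule, with hypothetical names that only mirror the Fortran variables (this is not the MadGraph code itself):

    # Combine daughter momenta into a propagator momentum; incoming legs
    # enter with a minus sign. Comparing against a literal 2 was only
    # correct for 2->N scattering: for a 1->N decay (nincoming=1) it
    # wrongly subtracted the momentum of outgoing leg 2.
    def propagator_momentum(p, daughters, nincoming):
        total = [0.0, 0.0, 0.0, 0.0]
        for d in daughters:
            sign = -1.0 if 1 <= d <= nincoming else +1.0  # was: d <= 2
            for j in range(4):
                total[j] += sign * p[d][j]
        return total

With `nincoming = 1` (a decay), leg 2 is now correctly treated as outgoing, which is presumably why this hunk accompanies the decay-kinematics fixes in this patch series.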
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
         if self.param_consistency:
             try:
-                self.do_update('dependent', timer=20)
+                if not self.update_dependent_done:
+                    self.do_update('dependent', timer=20)
+                self.update_dependent_done = False
             except MadGraph5Error as error:
                 if 'Missing block:' in str(error):
                     self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
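
The common_run_interface.py hunks above implement a simple run-once guard: `do_update('dependent')` triggers a slow, alarm-guarded model load, and `postcmd` used to repeat it after every answer. A simplified sketch of the pattern under the same names (a sketch of the control flow, not the full interface class):

    class CardEditor:
        def __init__(self):
            self.update_dependent_done = False

        def update_dependent(self, timer):
            pass  # placeholder for the real slow param/run card update

        def do_update(self, what, timer=0):
            if what == 'dependent':
                self.update_dependent(timer)       # slow step
                self.update_dependent_done = True  # remember it already ran

        def postcmd(self):
            # after each command: run the slow update at most once...
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
            self.update_dependent_done = False     # ...but re-arm for the next edit

Note also that `signal.alarm(timer)` is now armed only immediately before the model load instead of at function entry, so the 20-second budget is no longer consumed by the PDF bookkeeping that precedes it.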
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
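
The gen_ximprove.py hunk above replaces a broad try/except around the channel count with an explicit empty-output branch: a P* directory that reports no channels is recorded as a zero-result process, its matrix*orig.f files are copied back as matrix*optim.f so later compilation still finds them, and a stale error file is removed. A standalone Python sketch of that fallback, rewritten as a hypothetical helper (the real code lives inside get_helicity and uses misc.glob/files.cp):

    import glob, os, shutil

    def channels_or_skip(stdout, pdir, me_dir, p_zero_result):
        if stdout:
            return max(int(float(d)) for d in stdout.split())
        # no channel reported: keep the build consistent and skip this process
        for orig in glob.glob(os.path.join(pdir, 'matrix*orig.f')):
            shutil.copy(orig, orig.replace('orig', 'optim'))
        p_zero_result.append(pdir)
        err = os.path.join(me_dir, 'error')
        if os.path.exists(err):
            os.remove(err)  # stale marker left by the failed helicity run
        return None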
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-        
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
         """Not in help: Combine a given iteration
         combine_iteration Pdir Gdir S|R step
         S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
         level = int(options.logging)
     else:
         level = eval('logging.' + options.logging)
-    logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf'))
+    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
+    logging.config.fileConfig(log_path)
     logging.root.setLevel(level)
     logging.getLogger('madgraph').setLevel(level)
 except:
@@ -7433,9 +7438,9 @@ def error(self, msg=''):
     if '--web' in args:
         i = args.index('--web')
         args.pop(i)
-        cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmd(me_dir, force_run=True)
     else:
-        cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmdShell(me_dir, force_run=True)
     if not hasattr(cmd_line, 'do_%s' % args[0]):
         if parser_error:
             print(parser_error)
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
index d3fed3baa2..91cd3e5c22 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/misc.py
@@ -347,7 +347,7 @@ def tell(msg):
     if dependency=='ninja':
         if cmd.options['ninja'] in ['None',None,''] or\
            (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\
-           which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None):
+           which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None):
             tell("Installing ninja...")
             cmd.do_install('ninja')
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
index c6d3948cc4..c344ea1b15 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/shower_card.py
@@ -45,7 +45,9 @@ class ShowerCard(dict):
     false = ['.false.', 'f', 'false', '0']
     logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable',
                     'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable',
-                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td']
+                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td',
+                    'space_shower_me_corrections', 'time_shower_me_corrections',
+                    'time_shower_me_extended', 'time_shower_me_after_first']
     string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse']
     for i in range(1,100):
         string_vars.append('dm_'+str(i))
@@ -82,7 +84,11 @@ class ShowerCard(dict):
     'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'},
     'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'},
     'qcut' : {'PYTHIA8':'qcut'},
-    'njmax' : {'PYTHIA8':'njmax'}}
+    'njmax' : {'PYTHIA8':'njmax'},
+    'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'},
+    'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'},
+    'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'},
+    'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}}
     stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'}
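
The shower_card.py hunk above wires four new PYTHIA8-only switches through the card machinery. Each variable needs two entries: one in logical_vars, so the parser accepts true/false-style strings, and one in the per-shower name map, so it is only written out for showers that understand it. A simplified Python sketch of that double lookup (excerpted and renamed for illustration, not the ShowerCard class itself):

    logical_vars = ['space_shower_me_corrections']                    # excerpt
    name_map = {'space_shower_me_corrections':
                    {'PYTHIA8': 'space_shower_me_corrections'}}      # excerpt

    def write_var(shower, var, value):
        if shower not in name_map.get(var, {}):
            return None  # variable has no meaning for this shower: skip it
        if var in logical_vars:
            value = '.true.' if value else '.false.'
        return '%s = %s' % (name_map[var][shower], value)

A variable listed in only one of the two tables would either fail to parse or never be emitted, which is why the hunk touches both.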
diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
index 19819e2451..9fa30cfd7f 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
index 31f620c44e..0b4be4d5ed 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
+++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
index 521831ce4a..64d0b8e761 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index ccb39ba2cc..8cb80f0d38 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005671501159667969
+DEBUG: model prefixing takes 0.005633831024169922
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -160,28 +160,28 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]
-DEBUG: type(subproc_group)= [output.py at line 193]
-DEBUG: type(fortran_model)= [output.py at line 194]
-DEBUG: type(me)= me=0 [output.py at line 195]
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]
+DEBUG: type(subproc_group)= [output.py at line 190]
+DEBUG: type(fortran_model)= [output.py at line 191]
+DEBUG: type(me)= me=0 [output.py at line 192]
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/.
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 4 routines in 0.272 s
+ALOHA: aloha creates 4 routines in 0.267 s
 FFV1
 FFV1
 FFV2
@@ -198,9 +198,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]
 quit
 
-real 0m0.795s
-user 0m0.698s
-sys 0m0.066s
+real 0m3.653s
+user 0m0.601s
+sys 0m0.049s
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
index c0ab4edb92..684bd53bf5 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
index f2ef5c1e14..77b610753c 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
index 19819e2451..9fa30cfd7f 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
index 31f620c44e..0b4be4d5ed 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
index 521831ce4a..64d0b8e761 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index b0eb76c9f4..a1fa47508f 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005650758743286133
+DEBUG: model prefixing takes 0.005694150924682617
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1
 INFO: Process has 3 diagrams
-1 processes with 3 diagrams generated in 0.009 s
+1 processes with 3 diagrams generated in 0.008 s
 Total: 1 processes with 3 diagrams
 output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]
 INFO: initialize a new directory: CODEGEN_mad_gg_tt
 INFO: remove old information in CODEGEN_mad_gg_tt
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1
 INFO: Creating files in directory P1_gg_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttx
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.107 s
+Wrote files for 10 helas calls in 0.101 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in 0.152 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]
+ALOHA: aloha creates 2 routines in 0.145 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 4 routines in 0.140 s
+ALOHA: aloha creates 4 routines in 0.132 s
 VVV1
 FFV1
 FFV1
@@ -219,27 +219,28 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-DEBUG: p.returncode =  0 [output.py at line 235]
+DEBUG: p.returncode =  0 [output.py at line 232]
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README
 Run "open index.html" to see more information about this process.
 quit
 
-real 0m1.780s
-user 0m1.544s
-sys 0m0.218s
+real 0m4.772s
+user 0m1.470s
+sys 0m0.223s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -252,7 +253,7 @@ sys 0m0.218s
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -285,7 +286,7 @@ launch in debug mode
 * * * *
 * * * *
 * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
index 2a2fd25453..4c14989a3f 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
index 02f655f48c..d2e7a3c91d 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
index 0c2d2b0687..3ebd92c038 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
index fe184caddf..d80d770784 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
 DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index 5a3da931f2..9346ee4c6a 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
 DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
 $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C ****************************************************
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
index daea73a6df..0c2ce6ec40 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 
 REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C By the MadGraph5_aMC@NLO Development Team
 C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-           if (d1.gt.0.and.d1.le.2) then
+           if (d1.gt.0.and.d1.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
           endif
-           if (d2.gt.0.and.d2.le.2) then
+           if (d2.gt.0.and.d2.le.nincoming) then
             ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
           else
             ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
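
The banner.py hunk above (the same change already seen in the ee_mumu copy) does two things: it pins `SDE_strategy` to 1 whenever the process has a single initial-state particle, i.e. a decay, and it relaxes the loop on-shell threshold `OSThres` from 1.0e-13 to 1.0e-8. A minimal sketch of the defaulting rule, assuming a dict-like run card as in the diff (the rationale for preferring strategy 1 in decays is not stated in the patch):

    def apply_decay_defaults(run_card, proc_characteristic):
        # one incoming leg means a decay process: force SDE strategy 1
        if proc_characteristic['ninitial'] == 1:
            run_card['SDE_strategy'] = 1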
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
         if self.param_consistency:
             try:
-                self.do_update('dependent', timer=20)
+                if not self.update_dependent_done:
+                    self.do_update('dependent', timer=20)
+                self.update_dependent_done = False
             except MadGraph5Error as error:
                 if 'Missing block:' in str(error):
                     self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-        
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
        """Not in help: Combine a given iteration
        combine_iteration Pdir Gdir S|R step
        S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 27709b8f4f..805df19bd9 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005872249603271484  +DEBUG: model prefixing takes 0.00567626953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
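The madevent_interface.py hunk above stops deriving the process directory from the script's realpath and instead walks three dirname steps up from the internal package itself — per the comment in the hunk, to avoid PYTHONPATH resolving the wrong file. A minimal sketch of that derivation, assuming it runs inside a generated process directory where bin/internal is importable as internal:

    # internal.__file__ is <me_dir>/bin/internal/__init__.py, so three
    # dirname() calls walk back up to the process directory <me_dir>.
    import os
    import logging.config

    import internal

    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
    logging.config.fileConfig(log_path)  # config now resolved relative to me_dir
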
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.143 s VVV1 FFV1 FFV1 @@ -193,9 +193,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
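The shower_card.py hunks above register the four new PYTHIA8 matrix-element-correction switches twice: once in logical_vars, which fixes how the value is parsed, and once in the per-shower translation table, which decides which shower ever sees the option. A minimal sketch of that two-table pattern; my_new_flag is a hypothetical name, not one of the patched options:

    # An option must appear in BOTH tables to round-trip: logical_vars
    # types it as a boolean, name_map translates it per shower program.
    logical_vars = ['ue_enabled', 'hadronize', 'my_new_flag']
    name_map = {'my_new_flag': {'PYTHIA8': 'my_new_flag'}}

    def translate(option, value, shower):
        """Return (target_name, value) for this shower, or None to drop it."""
        targets = name_map.get(option, {})
        if shower in targets:
            return targets[shower], value
        return None

    print(translate('my_new_flag', True, 'PYTHIA8'))  # ('my_new_flag', True)
    print(translate('my_new_flag', True, 'HERWIG6'))  # None: HERWIG6 lacks it
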
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.565s -user 0m0.498s -sys 0m0.040s +real 0m3.529s +user 0m0.478s +sys 0m0.048s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 141d1f24ac..0e44ef42c3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 07d0bfa887..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index 3452d1e8da..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 4f6f322ed9..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0eefbc9b91..9d4dbd85f0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677461624145508  +DEBUG: model prefixing takes 0.005400419235229492  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,17 +163,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,23 +217,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
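The do_combine_events hunk in the madevent_interface.py diffs above (one copy per process directory) consolidates its progress reporting: a start timestamp at entry and a single elapsed-time message once combination and any bias correction are done. The pattern, reduced to a sketch with the actual work elided:

    import logging
    import time

    logger = logging.getLogger('madevent')

    def do_combine_events():
        start = time.time()
        # ... concatenate per-channel event files, apply bias corrections ...
        logger.info("combination of events done in %s s ", time.time() - start)
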
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.249 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.242 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.324 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.314 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -257,12 +257,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -276,16 +277,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. 
quit -real 0m2.345s -user 0m2.078s -sys 0m0.243s +real 0m5.282s +user 0m2.049s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -298,7 +299,7 @@ sys 0m0.243s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -331,7 +332,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index cdb64729b1..d0845f65f5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 02f655f48c..d2e7a3c91d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 0c2d2b0687..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index fe184caddf..d80d770784 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index 5a3da931f2..9346ee4c6a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f index daea73a6df..0c2ce6ec40 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index ce1badffca..1e24c2819d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 248ed1ec9e..3901ddcb20 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index f751e9f14a..53ca75eaf4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index 6eb0fa0827..d6c6f42c9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f index 02f406668c..5c91f2448c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ 
b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to 
work # after this error. + # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ 
-7419,7 +7423,8 @@ def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
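The banner.py hunks above (repeated in every regenerated process directory) add one process-dependent default and relax one MadLoop threshold: processes with a single initial-state particle force SDE_strategy to 1, and the OSThres default moves from 1.0e-13 to 1.0e-8. A sketch of the defaulting step with an illustrative proc_characteristic dict; the motivation is my assumption, presumably the alternative strategy does not apply to 1 -> N decay kinematics:

    proc_characteristic = {'ninitial': 1}   # illustrative 1 -> N process
    run_card = {'SDE_strategy': 2}

    if proc_characteristic['ninitial'] == 1:
        run_card['SDE_strategy'] = 1        # forced for single-particle initial states
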
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 740186af78..68afa8d9b0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005747556686401367  +DEBUG: model prefixing takes 0.005378007888793945  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
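The common_run_interface.py hunks above introduce a one-shot flag so the expensive dependent-parameter update runs at most once per answer cycle: do_update('dependent') records completion, postcmd and check_answer_consistency consult the flag before triggering it again, and postcmd re-arms it for the next command. A minimal sketch with the class reduced to the three methods involved:

    class CardEditor:
        def __init__(self):
            self.update_dependent_done = False

        def do_update(self, what, timer=0):
            if what == 'dependent':
                # ... recompute alpha_s and other dependent parameters (slow) ...
                self.update_dependent_done = True

        def postcmd(self):
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
            self.update_dependent_done = False  # re-arm for the next command

        def check_answer_consistency(self):
            if not self.update_dependent_done:
                self.do_update('dependent', timer=20)
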
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,23 +190,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
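The same common_run_interface.py hunks also move the signal.alarm(timer) call from the top of update_dependent down to just before the model load, so the timeout budget covers only the step it was meant to bound. A sketch of the rearranged SIGALRM pattern (Unix-only; load_model stands in for the slow mecmd.get_model() call, and the final disarm is illustrative rather than taken from the hunk):

    import signal
    import time

    class TimeOutError(Exception):
        pass

    def handle_alarm(signum, frame):
        raise TimeOutError

    signal.signal(signal.SIGALRM, handle_alarm)

    def load_model():
        time.sleep(1)                     # stand-in for the slow model load

    def update_dependent(timer=20):
        log_level = 30 if timer else 20   # cheap setup: alarm not armed yet
        # ... read the run card, choose alpha_s for the selected pdf ...
        if timer:
            signal.alarm(timer)           # armed just before the slow step
        try:
            load_model()
        except TimeOutError:
            pass                          # timed out: leave the cards as-is
        finally:
            signal.alarm(0)               # illustrative: disarm when done
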
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.152 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s +Wrote files for 36 helas calls in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.332 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.323 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.326 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -230,12 +230,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -245,16 +246,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
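The genps.f hunks in this patch replace the hard-coded d1.le.2 / d2.le.2 tests in get_channel_cut with d1.le.nincoming, so the channel-cut momentum flow no longer silently assumes a 2 -> N process. Rendered as a Python sketch of the per-daughter sign rule (the Fortran operates on a ptemp array indexed by leg number; here ptemp is a mapping from leg index to a 4-momentum list, with the surrounding loop elided):

    def accumulate_daughter(ptemp, node, daughter, nincoming):
        # legs 1..nincoming carry incoming momentum and are subtracted;
        # any other daughter (outgoing, or an internal negative index) adds
        sign = -1 if 0 < daughter <= nincoming else +1
        for j in range(4):
            ptemp[node][j] += sign * ptemp[daughter][j]
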
quit -real 0m2.221s -user 0m1.964s -sys 0m0.245s +real 0m5.147s +user 0m1.924s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -267,7 +268,7 @@ sys 0m0.245s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -300,7 +301,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3af4991f01..a0ffbbc219 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index f7f5899260..5e2bf0d19a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index d528b1d2f0..dd4cd3a0c2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 110e204c24..e28575ead8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index bf665ff6e0..a885b7fde3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 
@@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, msg=''): 
level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index f795e1428d..97056958fe 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0055065155029296875  +DEBUG: model prefixing takes 0.005817890167236328  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
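[Editor's note on the madevent_interface.py hunks earlier in this patch: they stop computing the process directory from the script's own realpath and instead walk up from the location of the internal package, as the new in-diff comment explains: internal.__file__ lives at <me_dir>/bin/internal/__init__.py, so three dirname calls recover <me_dir>. A minimal sketch of that path arithmetic in Python, with a made-up path standing in for the real internal.__file__:

    import os

    pkg_file = '/data/example/PROC_gg_ttg/bin/internal/__init__.py'  # i.e. internal.__file__
    internal_dir = os.path.dirname(pkg_file)   # .../PROC_gg_ttg/bin/internal
    bin_dir = os.path.dirname(internal_dir)    # .../PROC_gg_ttg/bin
    me_dir = os.path.dirname(bin_dir)          # .../PROC_gg_ttg
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
    print(me_dir, log_path)

This is also why the logging setup switches from root_path to log_path: resolving through the imported package avoids picking up a wrong directory via PYTHONPATH, which the old realpath-based root_path could do.]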
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.326 s VVV1 VVV1 FFV1 @@ -201,9 +201,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
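[Editor's note: two related patterns recur in the common_run_interface.py hunks of this patch. First, signal.alarm(timer) is no longer armed at the top of update_dependent but only just before the expensive model load, so the whole timeout budget covers the slow step; second, a new update_dependent_done flag lets postcmd and check_answer_consistency skip a repeat of an update that already ran. A standalone sketch of the combined idea, Unix-only and deliberately simplified (load_model is a placeholder, and the flag here lives on the module rather than on a card-editing object as in the real code):

    import signal

    class TimeOutError(Exception):
        pass

    def handle_alarm(signum, frame):
        raise TimeOutError

    signal.signal(signal.SIGALRM, handle_alarm)

    def load_model():
        return 'model'                 # stand-in for the genuinely slow step

    update_dependent_done = False

    def update_dependent(timer=0):
        global update_dependent_done
        if update_dependent_done:
            return None                # memoized: a second call is a no-op
        # ... cheap bookkeeping goes here, outside the timeout window ...
        if timer:
            signal.alarm(timer)        # arm the alarm only for the slow step
        try:
            model = load_model()
        except TimeOutError:
            model = None               # timed out: give up gracefully
        finally:
            signal.alarm(0)            # always disarm
        update_dependent_done = True
        return model
]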
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m0.870s -user 0m0.728s -sys 0m0.055s +real 0m3.779s +user 0m0.713s +sys 0m0.062s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 9393033e26..7f5e51681d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 9f559fe3ae..37d6ebe981 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 8995b15c82..361b488401 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 374e4defbb..eacd7a356a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005505084991455078  +DEBUG: model prefixing takes 0.0053293704986572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,23 +190,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s -Wrote files for 222 helas calls in 0.704 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s +Wrote files for 222 helas calls in 0.691 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.335 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.333 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.316 s VVV1 VVV1 FFV1 @@ -233,12 +233,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -248,16 +249,16 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. 
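[Editor's note: one upstream change worth spelling out is the gen_ximprove.py rewrite of get_helicity, repeated for every regenerated process in this patch. Instead of a try/except that logged and skipped a P* directory when the helicity survey printed nothing, the code now branches explicitly: it restores the pristine matrix*orig.f files over their _optim.f counterparts, records the directory as a zero-result, and clears any stale error file before moving on. A compact sketch of that fallback shape (the p_zero_result list and the glob pattern mirror the diff; the rest is simplified):

    import glob, math, os, shutil

    def channels_or_fallback(pdir, stdout, p_zero_result):
        if stdout:
            return max(math.floor(float(d)) for d in stdout.split())
        # empty survey: put the original matrix files back in place ...
        for orig in glob.glob(os.path.join(pdir, 'matrix*orig.f')):
            shutil.copy(orig, orig.replace('orig', 'optim'))
        p_zero_result.append(pdir)          # ... and remember the bad process
        err = os.path.join(pdir, 'error')
        if os.path.exists(err):
            os.remove(err)                  # drop the stale error marker
        return None                         # caller issues a `continue`

Restoring the _orig files matters because, as the in-diff comment notes, a bare continue would leave the matrixX_optim.f/matrixX_orig.f linkage inconsistent for the later stages.]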
quit -real 0m3.310s -user 0m3.061s -sys 0m0.239s +real 0m6.262s +user 0m3.028s +sys 0m0.232s ************************************************************ * * * W E L C O M E to * @@ -270,7 +271,7 @@ sys 0m0.239s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -303,7 +304,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index e4d3fe550f..b7568d1a73 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 896d64343e..57dd4aed47 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index 9d747e6dc1..adf0afbe05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index 043887bde3..e4e527260c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index df931e07c4..272c6bd97d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ 
-6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def error(self, 
msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index b1a7fdc7e4..80631c94bf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005366086959838867  +DEBUG: model prefixing takes 0.00567317008972168  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.161 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.431 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.325 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
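[Editor's note: the genps.f hunks in this patch (identical for every regenerated process) replace the hard-coded test d1.le.2 with d1.le.nincoming in get_channel_cut, so the propagator-momentum bookkeeping no longer assumes two incoming legs and also works for decays with a single initial particle: daughters that are incoming legs are subtracted, all other daughters are added. The same sign rule in Python, as a sketch with a toy 2->2 configuration (leg indices follow the Fortran convention, negative for internal propagators):

    def add_daughter(ptemp, node, daughter, nincoming):
        # incoming legs 1..nincoming enter with a minus sign, as in the
        # patched get_channel_cut; every other daughter is added
        sign = -1 if 0 < daughter <= nincoming else 1
        for j in range(4):
            ptemp[node][j] += sign * ptemp[daughter][j]

    ptemp = {1: [10, 0, 0, 10], 3: [6, 1, 0, 5], -1: [0, 0, 0, 0]}
    add_daughter(ptemp, -1, 1, nincoming=2)   # subtract incoming leg 1
    add_daughter(ptemp, -1, 3, nincoming=2)   # add outgoing leg 3
    print(ptemp[-1])                          # [-4, 1, 0, -5]
]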
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m1.466s -user 0m1.388s -sys 0m0.064s +real 0m4.435s +user 0m1.373s +sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 927a19a802..204439a1dc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index d681eb7504..04f7c62976 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index af1d671efc..ab3974344c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596160888671875  +DEBUG: model prefixing takes 0.005319833755493164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.921 s +1 processes with 1240 diagrams generated in 1.855 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,23 +192,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
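The Color-Flow lines above record the key optimization for this large process: the 15120-term color sum for g g > t t~ g g g is rewritten into 1630 terms by introducing 3030 shared contractions. The real pass lives inside the MG5aMC exporter; purely as a loose illustration of the underlying idea (common-subexpression elimination over a sum of products, with invented names, NOT the actual color-flow code), a minimal sketch could look like:

    # Illustration only: contract repeated adjacent factor pairs in a sum of
    # products into shared intermediates, so each is evaluated once.
    from collections import Counter

    def contract_terms(terms):
        """terms: list of tuples of factor names, e.g. ('T1', 'T2', 'T3')."""
        pair_counts = Counter()
        for t in terms:
            for i in range(len(t) - 1):
                pair_counts[(t[i], t[i + 1])] += 1
        contractions = {}  # (factor, factor) -> intermediate name
        new_terms = []
        for t in terms:
            t = list(t)
            i = 0
            while i < len(t) - 1:
                pair = (t[i], t[i + 1])
                if pair_counts[pair] > 1:  # shared by several terms: contract
                    name = contractions.setdefault(pair, 'C%d' % len(contractions))
                    t[i:i + 2] = [name]  # replace the pair by the intermediate
                else:
                    i += 1
            new_terms.append(tuple(t))
        return new_terms, contractions

Each shared product is then computed once and reused across terms; the actual MG5aMC pass works on the color-flow basis and additionally merges terms, which is where the reduction in the term count itself comes from.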
DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.769 s -Wrote files for 2281 helas calls in 18.847 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s +Wrote files for 2281 helas calls in 18.431 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.320 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 5 routines in 0.335 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.313 s VVV1 VVV1 FFV1 @@ -235,12 +235,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -250,16 +251,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
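One detail worth noting in the patching output above: patch.common now applies its gen_ximprove.py hunk at an offset ("Hunk #1 succeeded at 391 (offset 6 lines)"), because the upstream file gained six lines ahead of the patched region; the reworked get_helicity logic responsible for that appears in the gen_ximprove.py diff further down in this commit. Its survey fallback, reduced to a sketch (the misc and files arguments are stand-ins for the MG5aMC helper modules, and the return value replaces the in-loop continue of the real code):

    import math, os
    from os.path import join as pjoin

    def survey_channels(stdout, Pdir, me_dir, P_zero_result, misc, files):
        # Sketch of the regenerated get_helicity fallback in gen_ximprove.py:
        # an empty survey output no longer raises. The directory is recorded
        # as a zero result and bypassed, after copying matrix*_orig.f over
        # matrix*_optim.f so later compilation steps still find their input.
        if stdout:
            return max(math.floor(float(d)) for d in stdout.split())
        for matrix_file in misc.glob('matrix*orig.f', Pdir):
            files.cp(matrix_file, matrix_file.replace('orig', 'optim'))
        P_zero_result.append(Pdir)
        if os.path.exists(pjoin(me_dir, 'error')):
            os.remove(pjoin(me_dir, 'error'))  # clear a stale error marker
        return None  # caller skips this Pdir (the real code uses `continue`)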
quit -real 0m29.796s -user 0m29.282s -sys 0m0.413s +real 0m32.103s +user 0m28.586s +sys 0m0.412s ************************************************************ * * * W E L C O M E to * @@ -272,7 +273,7 @@ sys 0m0.413s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -305,7 +306,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 05d11d495d..2f92ecc4ba 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index a525c4ba3f..59033d7b2f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 2d3c5725be..d2a61fa2ac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 51b8d47520..f22dfbf5e6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index ac5285eda5..41dbc97183 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -413,7 +413,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': 
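The genps.f hunk above carries real behaviour: get_channel_cut decided whether a daughter momentum is subtracted (incoming leg) or added (outgoing leg) with a hard-coded d.le.2, which is only correct for 2 -> N topologies; it now tests d.le.nincoming, so 1 -> N configurations get the proper signs while 2 -> N behaviour is unchanged. This fits the banner.py change above that forces SDE_strategy = 1 for ninitial == 1 processes. A minimal Python transcription of the fixed loop body (ptemp and iforest modelled as dicts keyed like the Fortran arrays):

    def accumulate_propagator(ptemp, iforest, i, config, nincoming):
        # Daughters d1, d2 of internal propagator -i enter with a minus sign
        # when they are incoming legs (1 <= d <= nincoming, previously the
        # hard-coded d <= 2), and with a plus sign otherwise.
        d1 = iforest[(1, -i, config)]
        d2 = iforest[(2, -i, config)]
        for j in range(4):  # E, px, py, pz
            for d in (d1, d2):
                if 0 < d <= nincoming:
                    ptemp[(j, -i)] -= ptemp[(j, d)]
                else:
                    ptemp[(j, -i)] += ptemp[(j, d)]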
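The common_run_interface.py hunks introduce a simple done-flag: update_dependent_done is set once do_update('dependent', ...) has actually recomputed the dependent parameters, so the expensive 20-second-budgeted update is not repeated by both postcmd and check_answer_consistency within the same pass (postcmd re-arms the flag afterwards); the continuation of this diff below also moves signal.alarm(timer) so the SIGALRM budget covers only the model load rather than the preceding alpha_s/PDF bookkeeping. The guard pattern in isolation (a sketch, with a print standing in for the expensive update; not the real interface class):

    class CardChecker:
        def __init__(self):
            self.update_dependent_done = False

        def do_update_dependent(self):
            print('recomputing dependent parameters...')  # expensive in reality
            self.update_dependent_done = True

        def postcmd(self):
            if not self.update_dependent_done:
                self.do_update_dependent()
            self.update_dependent_done = False  # re-arm for the next command

        def check_answer_consistency(self):
            if not self.update_dependent_done:
                self.do_update_dependent()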
self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ def 
error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
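The madevent_interface.py launcher changes above stop deriving the run directory from the script's realpath and instead locate it from the imported internal package itself, which stays correct when the bin/ scripts are symlinked or a wrong path wins on PYTHONPATH; the logging configuration is then loaded relative to the same me_dir. The essential lines, as a standalone sketch (only meaningful inside a generated process directory where internal is importable):

    import os
    import internal  # the process directory's bin/internal package

    # internal.__file__ is <me_dir>/bin/internal/__init__.py, so three
    # dirname() calls recover the process directory itself.
    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')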
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 73a2d9596c..33bae20142 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00565791130065918  +DEBUG: model prefixing takes 0.005532503128051758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,28 +155,28 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.891 s +1 processes with 1240 diagrams generated in 1.880 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192]  -DEBUG: type(subproc_group)= [output.py at line 193]  -DEBUG: type(fortran_model)= [output.py at line 194]  -DEBUG: type(me)= me=0 [output.py at line 195]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  +DEBUG: type(subproc_group)= [output.py at line 190]  +DEBUG: type(fortran_model)= [output.py at line 191]  +DEBUG: type(me)= me=0 [output.py at line 192]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.621 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -204,9 +204,9 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m13.161s -user 0m12.961s -sys 0m0.105s +real 0m15.959s +user 0m12.810s +sys 0m0.102s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index a67b74e5b7..30acce4afc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index dc41720ca6..2565923dde 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 3fcb694ccd..89cb2749b0 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -14,7 +14,7 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect 2023-08-08 * +* VERSION 3.5.2_lo_vect 2023-11-08 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005532264709472656  +DEBUG: model prefixing takes 0.0057373046875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
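The "Crossed process found ..., reuse diagrams" lines above show why eight gq channels cost barely more than one: diagrams are generated once and reused for the flavour- and charge-crossed processes. As a loose illustration of that caching idea (not MG5aMC's actual data model), keyed on the crossing-invariant particle content:

    # Illustration only: cache diagram sets by the multiset of external
    # particles (all-outgoing convention), so crossed processes hit the cache.
    from collections import Counter

    _diagram_cache = {}

    def diagrams_for(all_outgoing_pdgs, generate):
        # all_outgoing_pdgs: PDG ids with incoming legs flipped to outgoing;
        # generate: callable performing the expensive diagram generation.
        key = frozenset(Counter(all_outgoing_pdgs).items())
        if key not in _diagram_cache:
            _diagram_cache[key] = generate(all_outgoing_pdgs)
        return _diagram_cache[key]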
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6240]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -231,16 +231,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.224 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200]  +ALOHA: aloha creates 2 routines in 0.144 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.132 s FFV1 FFV1 FFV1 @@ -260,12 +260,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209]  +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile patching file SubProcesses/makefile patching file bin/internal/gen_ximprove.py +Hunk #1 succeeded at 391 (offset 6 lines). patching file bin/internal/madevent_interface.py DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file auto_dsig1.f @@ -287,15 +288,15 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 235]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. 
quit -real 0m1.962s -user 0m1.726s +real 0m4.915s +user 0m1.680s sys 0m0.237s ************************************************************ * * @@ -309,7 +310,7 @@ sys 0m0.237s * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -342,7 +343,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.5.1_lo_vect * +* VERSION 3.5.2_lo_vect * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index dc07af3836..efb0752a31 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.5.1_lo_vect 2023-08-08 * +#* VERSION 3.5.2_lo_vect 2023-11-08 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 1c1a95761b..85c67c3554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.5.1_lo_vect \ No newline at end of file +3.5.2_lo_vect \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index c526dd6b31..649c608210 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index cdc2dc91ac..bf037c6c28 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index e6d01dad0b..bd8e2f143a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 8d92e4e769..930da28159 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index a90abc4ab4..0f49f5247b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
index f2eba72de7..c9b8759b60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
index 5ec9701b78..62c235de64 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
index 7a2e329e64..4c05be74a0 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -333,7 +333,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
index fe9c61504b..c00e33d954 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f
@@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config)
          d1 = iforest(1, -i, config)
          d2 = iforest(2, -i, config)
          do j=0,3
-            if (d1.gt.0.and.d1.le.2) then
+            if (d1.gt.0.and.d1.le.nincoming) then
                ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1)
             else
                ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1)
             endif
-            if (d2.gt.0.and.d2.le.2) then
+            if (d2.gt.0.and.d2.le.nincoming) then
                ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2)
             else
                ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
index 16e60d8182..0d17042f0d 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/__init__.py
@@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error):
 import os
 import logging
 import time
+pjoin = os.path.join
 
 #Look for basic file position MG5DIR and MG4DIR
 MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__),
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
index f0d38c2e5a..3995ce8109 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
@@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def):
                 continue
             break
 
+        if proc_characteristic['ninitial'] == 1:
+            self['SDE_strategy'] =1
+
         if 'MLM' in proc_characteristic['limitations']:
             if self['dynamical_scale_choice'] == -1:
                 self['dynamical_scale_choice'] = 3
@@ -5942,7 +5945,7 @@ def default_setup(self):
         self.add_param("CheckCycle", 3)
         self.add_param("MaxAttempts", 10)
         self.add_param("ZeroThres", 1e-9)
-        self.add_param("OSThres", 1.0e-13)
+        self.add_param("OSThres", 1.0e-8)
         self.add_param("DoubleCheckHelicityFilter", True)
         self.add_param("WriteOutFilters", True)
         self.add_param("UseLoopFilter", False)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
index 14c7f310dc..87cb4b88df 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py
@@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto
         self.load_default()
         self.define_paths(**opt)
         self.last_editline_pos = 0
+        self.update_dependent_done = False
 
         if 'allow_arg' not in opt or not opt['allow_arg']:
             # add some mininal content for this:
@@ -6585,7 +6586,9 @@ def postcmd(self, stop, line):
             self.check_card_consistency()
             if self.param_consistency:
                 try:
-                    self.do_update('dependent', timer=20)
+                    if not self.update_dependent_done:
+                        self.do_update('dependent', timer=20)
+                    self.update_dependent_done = False
                 except MadGraph5Error as error:
                     if 'Missing block:' in str(error):
                         self.fail_due_to_format +=1
@@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0):
             self.update_dependent(self.mother_interface, self.me_dir, self.param_card,
                                   self.paths['param'], timer, run_card=self.run_card,
                                   lhapdfconfig=self.lhapdf)
+            self.update_dependent_done = True
+
         elif args[0] == 'missing':
             self.update_missing()
@@ -6717,12 +6722,13 @@ class TimeOutError(Exception):
     def handle_alarm(signum, frame):
         raise TimeOutError
     signal.signal(signal.SIGALRM, handle_alarm)
+
     if timer:
-        signal.alarm(timer)
         log_level=30
     else:
         log_level=20
+
     if run_card:
         as_for_pdf = {'cteq6_m': 0.118,
                       'cteq6_d': 0.118,
@@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame):
             logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel])
             modify = True
 
+    if timer:
+        signal.alarm(timer)
+
+    # Try to load the model in the limited amount of time allowed
     try:
         model = mecmd.get_model()
@@ -6909,7 +6919,8 @@ def check_block(self, blockname):
     def check_answer_consistency(self):
         """function called if the code reads a file"""
         self.check_card_consistency()
-        self.do_update('dependent', timer=20)
+        if not self.update_dependent_done:
+            self.do_update('dependent', timer=20)
 
     def help_set(self):
         '''help message for set'''
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
index a88d60b282..5fd170d18d 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py
@@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True):
         (stdout, _) = p.communicate(''.encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        try:
+        if stdout:
             nb_channel = max([math.floor(float(d)) for d in stdout.split()])
-        except Exception as error:
-            misc.sprint(stdout, 'no channel or error for %s' % Pdir)
-            continue
-
+        else:
+            for matrix_file in misc.glob('matrix*orig.f', Pdir):
+                files.cp(matrix_file, matrix_file.replace('orig','optim'))
+            P_zero_result.append(Pdir)
+            if os.path.exists(pjoin(self.me_dir, 'error')):
+                os.remove(pjoin(self.me_dir, 'error'))
+            continue # bypass bad process
+
         self.cmd.compile(['madevent_forhel'], cwd=Pdir)
         if not os.path.exists(pjoin(Pdir, 'madevent_forhel')):
             raise Exception('Error make madevent_forhel not successful')
@@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True):
         #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts
         (stdout, _) = p.communicate(" ".encode())
         stdout = stdout.decode('ascii',errors='ignore')
-        if os.path.exists(pjoin(self.me_dir,'error')):
+        if os.path.exists(pjoin(self.me_dir, 'error')):
             raise Exception(pjoin(self.me_dir,'error'))
             # note a continue is not enough here, we have in top to link
             # the matrixX_optim.f to matrixX_orig.f to let the code to work
             # after this error.
+            # for matrix_file in misc.glob('matrix*orig.f', Pdir):
+            #     files.cp(matrix_file, matrix_file.replace('orig','optim'))
 
         if 'no events passed cuts' in stdout:
             raise Exception
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
index c9d1c7706a..0b849330ef 100644
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
@@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name):
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
+
     def plugin_input(self, finput):
         return
@@ -79,7 +79,7 @@ def check_validity(self):
             self['sde_strategy'] = 1
         if self['hel_recycling']:
             self['hel_recycling'] = False
-
+
 class GPURunCard(CPPRunCard):
     def default_setup(self):
         super(CPPRunCard, self).default_setup()
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
index d722702891..853aabc98a 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
@@ -1,4 +1,4 @@
-################################################################################
+###############################################################################
 #
 # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors
 #
@@ -3675,7 +3675,7 @@ def do_refine(self, line):
         devnull.close()
 
     ############################################################################
-    def do_combine_iteration(self, line):
+    def do_comine_iteration(self, line):
         """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step
            S is for survey R is for refine
@@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line):
     ############################################################################
     def do_combine_events(self, line):
         """Advanced commands: Launch combine events"""
-
+        start=time.time()
         args = self.split_arg(line)
         start = time.time()
         # Check argument's validity
@@ -3798,9 +3798,7 @@ def do_combine_events(self, line):
             self.correct_bias()
         elif self.run_card['custom_fcts']:
             self.correct_bias()
-
-        logger.info("combine events done in %s", time.time()-start)
-
+        logger.info("combination of events done in %s s ", time.time()-start)
         self.to_store.append('event')
@@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done):
     import optparse
     # Get the directory of the script real path (bin)
     # and add it to the current PYTHONPATH
-    root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))
+    #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))))
     sys.path.insert(0, root_path)
 
     class MyOptParser(optparse.OptionParser):
@@ -7411,7 +7409,13 @@ def error(self, msg=''):
     import logging.config
     # Set logging level according to the logging level given by options
     #logging.basicConfig(level=vars(logging)[options.logging])
+    import internal
     import internal.coloring_logging
+    # internal.file = XXX/bin/internal/__init__.py
+    # => need three dirname to get XXX
+    # we use internal to have any issue with pythonpath finding the wrong file
+    me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__)))
+    print("me_dir is", me_dir)
     try:
         if __debug__ and options.logging == 'INFO':
             options.logging = 'DEBUG'
@@ -7419,7 +7423,8 @@ def error(self, msg=''):
             level = int(options.logging)
         else:
             level = eval('logging.' + options.logging)
-        logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf'))
+        log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')
+        logging.config.fileConfig(log_path)
         logging.root.setLevel(level)
         logging.getLogger('madgraph').setLevel(level)
     except:
@@ -7433,9 +7438,9 @@ def error(self, msg=''):
     if '--web' in args:
         i = args.index('--web')
         args.pop(i)
-        cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmd(me_dir, force_run=True)
     else:
-        cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True)
+        cmd_line = MadEventCmdShell(me_dir, force_run=True)
     if not hasattr(cmd_line, 'do_%s' % args[0]):
         if parser_error:
             print(parser_error)
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
index d3fed3baa2..91cd3e5c22 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/misc.py
@@ -347,7 +347,7 @@ def tell(msg):
     if dependency=='ninja':
         if cmd.options['ninja'] in ['None',None,''] or\
            (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\
-           which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None):
+           which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None):
             tell("Installing ninja...")
             cmd.do_install('ninja')
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
index c6d3948cc4..c344ea1b15 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/shower_card.py
@@ -45,7 +45,9 @@ class ShowerCard(dict):
     false = ['.false.', 'f', 'false', '0']
     logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable',
                     'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable',
-                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td']
+                    'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td',
+                    'space_shower_me_corrections', 'time_shower_me_corrections',
+                    'time_shower_me_extended', 'time_shower_me_after_first']
     string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse']
     for i in range(1,100):
         string_vars.append('dm_'+str(i))
@@ -82,7 +84,11 @@ class ShowerCard(dict):
         'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'},
         'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'},
         'qcut' : {'PYTHIA8':'qcut'},
-        'njmax' : {'PYTHIA8':'njmax'}}
+        'njmax' : {'PYTHIA8':'njmax'},
+        'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'},
+        'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'},
+        'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'},
+        'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}}
     stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'}
diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
index 0dd5f20f71..cd4e6de668 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
index d5eda63ee0..c06dcbb252 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
index 0c77cf58f0..a6eb185434 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 06d5354735..16374bd28e 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0056154727935791016 
+DEBUG: model prefixing takes 0.005791902542114258 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.080 s
+8 processes with 40 diagrams generated in 0.078 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1
@@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1
 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=0 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=0 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/.
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=1 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=1 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/.
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.146 s
+ALOHA: aloha creates 2 routines in 0.144 s
 FFV1
 FFV1
 FFV1
@@ -225,9 +225,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206] 
 quit
-real 0m0.655s
-user 0m0.595s
-sys 0m0.055s
+real 0m3.656s
+user 0m0.594s
+sys 0m0.059s
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
index 037662f7db..4965f393c5 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
index cdc2dc91ac..bf037c6c28 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
index 12179b9801..5024e8e239 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
index a90abc4ab4..0f49f5247b 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
index 0dd5f20f71..cd4e6de668 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
index d5eda63ee0..c06dcbb252 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
index 0c77cf58f0..a6eb185434 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index 645c0db954..3b04fc3fb3 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -135,22 +135,22 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h
 INFO: Organizing processes into subprocess groups
 INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 192] 
-DEBUG: type(subproc_group)= [output.py at line 193] 
-DEBUG: type(fortran_model)= [output.py at line 194] 
-DEBUG: type(me)= me=0 [output.py at line 195] 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
+DEBUG: type(subproc_group)= [output.py at line 190] 
+DEBUG: type(fortran_model)= [output.py at line 191] 
+DEBUG: type(me)= me=0 [output.py at line 192] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/.
 Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines
 ALOHA: aloha creates 1 routines in 0.062 s
@@ -163,9 +163,9 @@ FileWriter for /
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc
 INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/.
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206] 
 quit
-real 0m0.430s
+real 0m3.422s
 user 0m0.371s
-sys 0m0.055s
+sys 0m0.048s
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
index 6cc0be1461..1d59f8e3cf 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
index d0312182d5..dbc5aa0e4e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
index a2e9b6a70c..eae9ff5242 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
@@ -8,7 +8,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
index fde65d5571..e5442756b1 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
index d1a451b2c3..790485fee0 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h
@@ -7,7 +7,7 @@
 // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 1d0d9e2a35..8b6ca99446 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -14,7 +14,7 @@ Running MG5 in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect 2023-08-08 *
+* VERSION 3.5.2_lo_vect 2023-11-08 *
 * *
 * WARNING: UNKNOWN DEVELOPMENT VERSION. *
 * WARNING: DO NOT USE FOR PRODUCTION *
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005470752716064453 
+DEBUG: model prefixing takes 0.00538325309753418 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.137 s
+13 processes with 76 diagrams generated in 0.134 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes.
@@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
-65 processes with 1119 diagrams generated in 1.856 s
+65 processes with 1119 diagrams generated in 1.811 s
 Total: 83 processes with 1202 diagrams
 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 158] 
+DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
 INFO: initialize a new directory: CODEGEN_mad_pp_tt012j
 INFO: remove old information in CODEGEN_mad_pp_tt012j
-DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 163] 
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards 
@@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2
 INFO: Creating files in directory P2_gg_ttxgg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
 INFO: Creating files in directory P2_gg_ttxuux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux
 INFO: Creating files in directory P2_gu_ttxgu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu
 INFO: Creating files in directory P2_gux_ttxgux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux
 INFO: Creating files in directory P2_uux_ttxgg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg
 INFO: Creating files in directory P1_gg_ttxg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg
 INFO: Creating files in directory P2_uu_ttxuu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu
 INFO: Creating files in directory P2_uux_ttxuux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux
 INFO: Creating files in directory P2_uxux_ttxuxux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux
 INFO: Creating files in directory P2_uc_ttxuc
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc
 INFO: Creating files in directory P2_uux_ttxccx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx
 INFO: Creating files in directory P2_ucx_ttxucx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx
 INFO: Creating files in directory P2_uxcx_ttxuxcx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx
 INFO: Creating files in directory P1_gu_ttxu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu
 INFO: Creating files in directory P1_gux_ttxux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux
 INFO: Creating files in directory P1_uux_ttxg
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group uux_ttxg
 INFO: Creating files in directory P0_gg_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2
 INFO: Finding symmetric diagrams for subprocess group gg_ttx
 INFO: Creating files in directory P0_uux_ttx
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG: process_exporter_cpp =  [export_v4.py at line 6240] 
+DEBUG: process_exporter_cpp =  [export_v4.py at line 6262] 
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -801,23 +801,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2
 INFO: Finding symmetric diagrams for subprocess group uux_ttx
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.304 s
-Wrote files for 810 helas calls in 3.574 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s
+Wrote files for 810 helas calls in 3.215 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.355 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 200] 
+ALOHA: aloha creates 5 routines in 0.333 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.318 s
+ALOHA: aloha creates 10 routines in 0.312 s
 VVV1
 VVV1
 FFV1
@@ -844,12 +844,13 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran
 INFO: Use c++ compiler g++
 INFO: Generate web pages
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 209] 
+DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
+Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
@@ -1021,16 +1022,16 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: p.returncode =  0 [output.py at line 235] 
+DEBUG: p.returncode =  0 [output.py at line 232] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
 Run "open index.html" to see more information about this process.
 quit
-real 0m9.272s
-user 0m8.475s
-sys 0m0.501s
+real 0m11.764s
+user 0m8.242s
+sys 0m0.480s
 ************************************************************
 * *
 * W E L C O M E to *
@@ -1043,7 +1044,7 @@ sys 0m0.501s
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
@@ -1076,7 +1077,7 @@ launch in debug mode
 * * * * * * * * * * * *
-* VERSION 3.5.1_lo_vect *
+* VERSION 3.5.2_lo_vect *
 * *
 * The MadGraph5_aMC@NLO Development Team - Find us at *
 * https://server06.fynu.ucl.ac.be/projects/madgraph *
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
index 944298ae75..c0b1a2fd98 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
@@ -8,7 +8,7 @@
 #* * * *
 #* *
 #* *
-#* VERSION 3.5.1_lo_vect 2023-08-08 *
+#* VERSION 3.5.2_lo_vect 2023-11-08 *
 #* *
 #* WARNING: UNKNOWN DEVELOPMENT VERSION. *
 #* WARNING: DO NOT USE FOR PRODUCTION *
diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
index 1c1a95761b..85c67c3554 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.5.1_lo_vect
\ No newline at end of file
+3.5.2_lo_vect
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index 0317bbc95a..30815cd085 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
index ecd2d1364e..448175be9d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
index dce732e252..963d8ec072 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
index a48f6997f3..d4e2956b18 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
index d803e4f19f..5b3b723e59 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -301,7 +301,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
index 75110e8fec..fa46e42b8f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
index 3d5ca9d556..e166fa1652 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
index 3d59efb411..2cc5a2026a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
index f9147f699e..2344ddbe81 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
index 4c21758744..1dea73e826 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -304,7 +304,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
index f7f5899260..5e2bf0d19a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
index 9f559fe3ae..37d6ebe981 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
index d528b1d2f0..dd4cd3a0c2 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f
@@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP
       DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
index 110e204c24..e28575ead8 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
@@ -1,7 +1,7 @@
       DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
      $     ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED)
 C     ****************************************************
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index bf665ff6e0..a885b7fde3 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -317,7 +317,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+C     Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
index 90a457ac40..3b6b1a6c16 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
@@ -7,7 +7,7 @@
 // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08
+// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
index cdc2dc91ac..bf037c6c28 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
@@ -7,7 +7,7 @@
 // Further modified by: O. Mattelaer, S. Roiser, A.
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index 249a3e4e3c..6c1667bc0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index ba39cab867..ee1484ab56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index d61f0e1a21..b7d8649204 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 9a73b3ed94..eb62f13990 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index a90abc4ab4..0f49f5247b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index f2eba72de7..c9b8759b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 5ec9701b78..62c235de64 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index b082becd2a..8a699645cd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index dc1a3e9d26..c47ef64ec8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index 06af307caa..f8bdb38aee 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index 408403e5d9..628e0d8092 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index 842b1c72d4..b66a887225 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 265f6006db..7bc63ee8a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -320,7 +320,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index cbc45ff652..0cbb15fba7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index a41aa7611a..9f43559181 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index c23550e9b7..84ee7e5b85 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 4e2bfe85ab..aa73f64dba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -216,7 +216,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index c8fbb1cc8b..46e6ff0da7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -349,7 +349,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 5723ed5665..d9f2d09952 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 95f4bf6912..f26b60c5bb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index d196e8ed65..abb75a925b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index e5a0390c47..d6bf2155ff 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 4f966fab6d..fabc6786d3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index b8f74ecafe..0d1c319939 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index a54b0bb8fe..853175b477 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index bc732da055..94fe1937c3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 309be94a99..50c024adc3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index c03cebacb0..210884dccf 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 2495941a73..8e3985f427 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index d31dd972a9..e60cb5b6d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index 399b68be58..3e0e30af23 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index 23d82657bf..e639ee4c34 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 39422dc34c..a8c5f11ae3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 529477ff3e..22398e7ab4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 4f557f24ab..5329710b87 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. 
Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index da207359fc..94cfdd1487 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index 4d12dfeade..37f4a35577 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f index 9e27e48c99..66b1820c10 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e54a24ea57..3955de70dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 1818cf79ed..391789dc81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index cfd6a270b5..5ce83d5f12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 5bac32b00a..ea0697602c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 6bdc5db576..9403b67a1a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8638bbefa2..bfc3d0809f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 41e15f6ad0..2d95f4b170 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index efdae70d19..44e8c9d920 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index 50c16edaac..302d0eda9c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b2cf62531..f51744ae5d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index c071cc6900..222800dcfd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index b93bb3909d..14490d782f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index 72e76f54e4..ab270fe554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 577a8d9c54..e9b4ddc613 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -266,7 +266,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index c5a7b6787c..f93b850d5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 2eb6b491fa..ef9407041b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 2f4866b6ca..1543c29649 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 4b08b69f90..f5ef1f7b43 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 
3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index f4e431c5ce..83e40fb02c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f index a843f4656a..9996fdea2d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 8682128442..1aa88699db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index dbd5b60487..58cece5c62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 3e29e25982..867eb95566 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 123a3ae00e..ae43656176 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 6d8f6b4ed8..205e3daf83 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7d3141cfc4..5f356a519e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index f92e527895..6bd3135c3c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index 44da6cd9ce..8ded31027d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index a4cb748b19..7ce014f5f5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -240,7 +240,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 53f591633e..dfbec413a8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -354,7 +354,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 6ec302f68b..af04d58c3e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -7,7 +7,7 @@ // Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 53c3b7149b..4e53fa1250 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -7,7 +7,7 @@ // Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index 43ccdff1e1..2acdc960db 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -359,7 +359,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index 3a3ed05151..115e19c70e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ALL_CM_RAP, ALL_WGT, IMODE, ALL_OUT, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f index dce10b9553..392b30a39f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -352,7 +352,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +C Generated by MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f index fe9c61504b..c00e33d954 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f @@ -1877,12 +1877,12 @@ double precision function get_channel_cut(p, config) d1 = iforest(1, -i, config) d2 = iforest(2, -i, config) do j=0,3 - if (d1.gt.0.and.d1.le.2) then + if (d1.gt.0.and.d1.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d1) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d1) endif - if (d2.gt.0.and.d2.le.2) then + if (d2.gt.0.and.d2.le.nincoming) then ptemp(j,-i) = ptemp(j,-i) - ptemp(j, d2) else ptemp(j,-i) = ptemp(j,-i)+ptemp(j, d2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py index 16e60d8182..0d17042f0d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/__init__.py @@ -26,6 +26,7 @@ class aMCatNLOError(MadGraph5Error): import os import logging import time +pjoin = os.path.join #Look for basic file position MG5DIR and MG4DIR MG5DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index f0d38c2e5a..3995ce8109 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -4877,6 +4877,9 @@ def create_default_for_process(self, proc_characteristic, history, proc_def): continue break + if proc_characteristic['ninitial'] == 1: + self['SDE_strategy'] =1 + if 'MLM' in proc_characteristic['limitations']: if self['dynamical_scale_choice'] == -1: self['dynamical_scale_choice'] = 3 @@ -5942,7 +5945,7 @@ def default_setup(self): self.add_param("CheckCycle", 3) self.add_param("MaxAttempts", 10) self.add_param("ZeroThres", 1e-9) - self.add_param("OSThres", 1.0e-13) + self.add_param("OSThres", 1.0e-8) self.add_param("DoubleCheckHelicityFilter", True) self.add_param("WriteOutFilters", True) self.add_param("UseLoopFilter", False) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 14c7f310dc..87cb4b88df 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -4906,6 +4906,7 @@ def __init__(self, question, cards=[], from_banner=None, banner=None, mode='auto self.load_default() self.define_paths(**opt) self.last_editline_pos = 0 + self.update_dependent_done = False if 'allow_arg' not in opt or not opt['allow_arg']: # add some mininal content for this: @@ -6585,7 +6586,9 @@ def postcmd(self, stop, line): self.check_card_consistency() if self.param_consistency: try: - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) + self.update_dependent_done = 
False except MadGraph5Error as error: if 'Missing block:' in str(error): self.fail_due_to_format +=1 @@ -6638,6 +6641,8 @@ def do_update(self, line, timer=0): self.update_dependent(self.mother_interface, self.me_dir, self.param_card, self.paths['param'], timer, run_card=self.run_card, lhapdfconfig=self.lhapdf) + self.update_dependent_done = True + elif args[0] == 'missing': self.update_missing() @@ -6717,12 +6722,13 @@ class TimeOutError(Exception): def handle_alarm(signum, frame): raise TimeOutError signal.signal(signal.SIGALRM, handle_alarm) + if timer: - signal.alarm(timer) log_level=30 else: log_level=20 + if run_card: as_for_pdf = {'cteq6_m': 0.118, 'cteq6_d': 0.118, @@ -6781,6 +6787,10 @@ def handle_alarm(signum, frame): logger.log(log_level, "update the strong coupling value (alpha_s) to the value from the pdf selected: %s", as_for_pdf[pdlabel]) modify = True + if timer: + signal.alarm(timer) + + # Try to load the model in the limited amount of time allowed try: model = mecmd.get_model() @@ -6909,7 +6919,8 @@ def check_block(self, blockname): def check_answer_consistency(self): """function called if the code reads a file""" self.check_card_consistency() - self.do_update('dependent', timer=20) + if not self.update_dependent_done: + self.do_update('dependent', timer=20) def help_set(self): '''help message for set''' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index a88d60b282..5fd170d18d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -157,12 +157,16 @@ def get_helicity(self, to_submit=True, clean=True): (stdout, _) = p.communicate(''.encode()) stdout = stdout.decode('ascii',errors='ignore') - try: + if stdout: nb_channel = max([math.floor(float(d)) for d in stdout.split()]) - except Exception as error: - misc.sprint(stdout, 'no channel or error for %s' % Pdir) - continue - + else: + for matrix_file in misc.glob('matrix*orig.f', Pdir): + files.cp(matrix_file, matrix_file.replace('orig','optim')) + P_zero_result.append(Pdir) + if os.path.exists(pjoin(self.me_dir, 'error')): + os.remove(pjoin(self.me_dir, 'error')) + continue # bypass bad process + self.cmd.compile(['madevent_forhel'], cwd=Pdir) if not os.path.exists(pjoin(Pdir, 'madevent_forhel')): raise Exception('Error make madevent_forhel not successful') @@ -183,11 +187,13 @@ def get_helicity(self, to_submit=True, clean=True): #sym_input = "%(points)d %(iterations)d %(accuracy)f \n" % self.opts (stdout, _) = p.communicate(" ".encode()) stdout = stdout.decode('ascii',errors='ignore') - if os.path.exists(pjoin(self.me_dir,'error')): + if os.path.exists(pjoin(self.me_dir, 'error')): raise Exception(pjoin(self.me_dir,'error')) # note a continue is not enough here, we have in top to link # the matrixX_optim.f to matrixX_orig.f to let the code to work # after this error. 
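[Editor's note] The common_run_interface.py hunks above implement a simple run-once guard: postcmd and check_answer_consistency skip the expensive do_update('dependent') pass whenever it has already run, and do_update itself records that fact by raising the flag. Below is a minimal Python sketch of that memoization idea only; the CardEditor class, its method signatures, and the sleep stand-in are simplified placeholders, not the real interface.

    import time

    class CardEditor:
        def __init__(self):
            # reset at the start of each card-editing session
            self.update_dependent_done = False

        def do_update(self):
            # stand-in for the expensive recomputation of dependent parameters
            time.sleep(0.1)
            self.update_dependent_done = True  # remember that it already ran

        def postcmd(self):
            # called after every command: run the update at most once
            if not self.update_dependent_done:
                self.do_update()

Note that in the actual hunk postcmd also resets the flag to False after the guarded call, so a later explicit update still goes through; the sketch keeps only the core of the pattern.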
+ # for matrix_file in misc.glob('matrix*orig.f', Pdir): + # files.cp(matrix_file, matrix_file.replace('orig','optim')) if 'no events passed cuts' in stdout: raise Exception diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index c9d1c7706a..0b849330ef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -57,7 +57,7 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + def plugin_input(self, finput): return @@ -79,7 +79,7 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index d722702891..853aabc98a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1,4 +1,4 @@ -################################################################################ +############################################################################### # # Copyright (c) 2011 The MadGraph5_aMC@NLO Development team and Contributors # @@ -3675,7 +3675,7 @@ def do_refine(self, line): devnull.close() ############################################################################ - def do_combine_iteration(self, line): + def do_comine_iteration(self, line): """Not in help: Combine a given iteration combine_iteration Pdir Gdir S|R step S is for survey R is for refine @@ -3703,7 +3703,7 @@ def do_combine_iteration(self, line): ############################################################################ def do_combine_events(self, line): """Advanced commands: Launch combine events""" - + start=time.time() args = self.split_arg(line) start = time.time() # Check argument's validity @@ -3798,9 +3798,7 @@ def do_combine_events(self, line): self.correct_bias() elif self.run_card['custom_fcts']: self.correct_bias() - - logger.info("combine events done in %s", time.time()-start) - + logger.info("combination of events done in %s s ", time.time()-start) self.to_store.append('event') @@ -7368,7 +7366,7 @@ def wait_monitoring(Idle, Running, Done): import optparse # Get the directory of the script real path (bin) # and add it to the current PYTHONPATH - root_path = os.path.dirname(os.path.dirname(os.path.realpath( __file__ ))) + #root_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath( __file__ )))) sys.path.insert(0, root_path) class MyOptParser(optparse.OptionParser): @@ -7411,7 +7409,13 @@ def error(self, msg=''): import logging.config # Set logging level according to the logging level given by options #logging.basicConfig(level=vars(logging)[options.logging]) + import internal import internal.coloring_logging + # internal.file = XXX/bin/internal/__init__.py + # => need three dirname to get XXX + # we use internal to have any issue with pythonpath finding the wrong file + me_dir = os.path.dirname(os.path.dirname(os.path.dirname(internal.__file__))) + print("me_dir is", me_dir) try: if __debug__ and options.logging == 'INFO': options.logging = 'DEBUG' @@ -7419,7 +7423,8 @@ 
def error(self, msg=''): level = int(options.logging) else: level = eval('logging.' + options.logging) - logging.config.fileConfig(os.path.join(root_path, 'internal', 'me5_logging.conf')) + log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf') + logging.config.fileConfig(log_path) logging.root.setLevel(level) logging.getLogger('madgraph').setLevel(level) except: @@ -7433,9 +7438,9 @@ def error(self, msg=''): if '--web' in args: i = args.index('--web') args.pop(i) - cmd_line = MadEventCmd(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmd(me_dir, force_run=True) else: - cmd_line = MadEventCmdShell(os.path.dirname(root_path),force_run=True) + cmd_line = MadEventCmdShell(me_dir, force_run=True) if not hasattr(cmd_line, 'do_%s' % args[0]): if parser_error: print(parser_error) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py index d3fed3baa2..91cd3e5c22 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/misc.py @@ -347,7 +347,7 @@ def tell(msg): if dependency=='ninja': if cmd.options['ninja'] in ['None',None,''] or\ (cmd.options['ninja'] == './HEPTools/lib' and not MG5dir is None and\ - which_lib(pjoin(MG5dir,cmd.options['ninja'],'libninja.a')) is None): + which_lib(pjoin(MG5dir,cmd.options['ninja'],'lib','libninja.a')) is None): tell("Installing ninja...") cmd.do_install('ninja') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py index c6d3948cc4..c344ea1b15 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/shower_card.py @@ -45,7 +45,9 @@ class ShowerCard(dict): false = ['.false.', 'f', 'false', '0'] logical_vars = ['ue_enabled', 'hadronize', 'b_stable', 'pi_stable', 'wp_stable', 'wm_stable', 'z_stable', 'h_stable', 'tap_stable', 'tam_stable', - 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td'] + 'mup_stable', 'mum_stable', 'is_4lep', 'is_bbar', 'combine_td', + 'space_shower_me_corrections', 'time_shower_me_corrections', + 'time_shower_me_extended', 'time_shower_me_after_first'] string_vars = ['extralibs', 'extrapaths', 'includepaths', 'analyse'] for i in range(1,100): string_vars.append('dm_'+str(i)) @@ -82,7 +84,11 @@ class ShowerCard(dict): 'b_mass' : {'HERWIG6':'b_mass', 'PYTHIA6': 'b_mass', 'HERWIGPP': 'b_mass', 'PYTHIA8': 'b_mass'}, 'analyse' : {'HERWIG6':'hwuti', 'PYTHIA6':'pyuti', 'HERWIGPP':'hwpputi', 'PYTHIA8':'py8uti'}, 'qcut' : {'PYTHIA8':'qcut'}, - 'njmax' : {'PYTHIA8':'njmax'}} + 'njmax' : {'PYTHIA8':'njmax'}, + 'space_shower_me_corrections' : {'PYTHIA8':'space_shower_me_corrections'}, + 'time_shower_me_corrections' : {'PYTHIA8':'time_shower_me_corrections'}, + 'time_shower_me_extended' : {'PYTHIA8':'time_shower_me_extended'}, + 'time_shower_me_after_first' : {'PYTHIA8':'time_shower_me_after_first'}} stdhep_dict = {'HERWIG6':'mcatnlo_hwan_stdhep.o', 'PYTHIA6':'mcatnlo_pyan_stdhep.o'} diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 9b946c21e1..8df465ad6d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -8,7 +8,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
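[Editor's note] The madevent_interface.py change above stops deriving the process directory from the realpath of the launcher script and instead recovers it from the location of the imported internal package: since internal/__init__.py lives at <me_dir>/bin/internal/__init__.py, three dirname calls yield <me_dir>, and me5_logging.conf is then located relative to it. A small self-contained sketch of that derivation (the helper name and example path are hypothetical):

    import os

    def me_dir_from(init_file):
        # init_file = <me_dir>/bin/internal/__init__.py
        # three dirname calls strip __init__.py, internal/ and bin/
        return os.path.dirname(os.path.dirname(os.path.dirname(init_file)))

    me_dir = me_dir_from('/work/pp_tt012j.mad/bin/internal/__init__.py')
    assert me_dir == '/work/pp_tt012j.mad'
    log_path = os.path.join(me_dir, 'bin', 'internal', 'me5_logging.conf')

Resolving via the package rather than the entry script appears to be the motivation stated in the hunk's own comments: it avoids picking up the wrong tree when PYTHONPATH points at another installation.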
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 9d09eb6b62..64fc3fea62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 6b32c66b9b..b6568d3761 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -7,7 +7,7 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.1_lo_vect, 2023-08-08 +// MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== From 381424b526a407732c63fc87fdc497989ba3931b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 06:37:21 +0100 Subject: [PATCH 13/14] [gpucpp] rerun 78 tput tests, with FPEs enabled, after the upgrade to 3.5.2 - usual failures in ggttg f/m and gqttq f (#783), no change in performance STARTED AT Thu Nov 9 05:26:21 PM CET 2023 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Thu Nov 9 05:54:46 PM CET 2023 [Status=2] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Thu Nov 9 06:05:38 PM CET 2023 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Thu Nov 9 06:15:05 PM CET 2023 [Status=2] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Thu Nov 9 06:18:20 PM CET 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Thu Nov 9 06:21:32 PM CET 2023 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 36 +++---- 
.../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 36 +++---- .../log_ggttg_mad_f_inl0_hrd1.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 36 +++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 36 +++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 92 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 92 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd1.txt | 92 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_m_inl0_hrd1.txt | 100 +++++++++--------- 78 files changed, 3476 insertions(+), 3476 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 4f18003d70..96be4f25ce 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:15:12 +DATE: 2023-11-09_17:36:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.482370e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.785159e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.963951e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632744e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.846433e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.013402e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677103 sec - 2,617,238,862 cycles # 2.883 GHz - 4,033,048,225 instructions # 1.54 insn per cycle - 0.968798898 seconds time elapsed +TOTAL : 0.666402 sec + 2,677,197,972 cycles # 3.012 GHz + 4,052,373,824 instructions # 1.51 insn per cycle + 0.957128261 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115937e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309320e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.309320e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.324668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.324668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.040810 sec - 18,355,110,031 cycles # 3.037 GHz - 44,036,146,715 instructions # 2.40 insn per cycle - 6.046149721 seconds time elapsed +TOTAL : 5.970581 sec + 18,294,560,469 cycles # 3.063 GHz + 44,035,841,714 instructions # 2.41 insn per cycle + 5.975709847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.614682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109953e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109953e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201099e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201099e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305087 sec - 12,797,655,048 cycles # 2.970 GHz - 31,002,550,325 instructions # 2.42 insn per cycle - 4.310429047 seconds time elapsed +TOTAL : 4.151985 sec + 12,801,375,184 cycles # 3.080 GHz + 31,001,968,290 instructions # 2.42 insn per cycle + 4.157180427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.864325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.864325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.097286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.453382 sec - 10,049,928,632 cycles # 2.906 GHz - 19,377,949,384 instructions # 1.93 insn per cycle - 3.458678566 seconds time elapsed +TOTAL : 3.388202 sec + 10,019,877,774 cycles # 2.954 GHz + 19,377,611,613 instructions # 1.93 insn per cycle + 3.393320382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.139569e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.018506e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.018506e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.054473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.054473e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.335195 sec - 9,699,652,158 cycles # 2.904 GHz - 18,994,942,569 instructions # 1.96 insn per cycle - 3.340655484 seconds time elapsed +TOTAL : 3.283560 sec + 9,692,698,438 cycles # 2.948 GHz + 19,006,248,514 instructions # 1.96 insn per cycle + 3.288694745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.800324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389989e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.836531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.447502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.447502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.895197 sec - 8,617,547,988 cycles # 2.211 GHz - 15,739,004,417 instructions # 1.83 insn per cycle - 3.900641958 seconds time elapsed +TOTAL : 3.828285 sec + 8,619,412,035 cycles # 2.250 GHz + 15,739,302,747 instructions # 1.83 insn per cycle + 3.833534805 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 60971ecd43..46e9abca4a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:50:38 +DATE: 2023-11-09_18:08:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.736559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.745060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.745060e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.786999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766835e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766835e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.222962 sec - 7,400,904,179 cycles # 2.991 GHz - 13,138,789,289 instructions # 1.78 insn per cycle - 2.532867460 seconds time elapsed +TOTAL : 2.197852 sec + 7,407,513,320 cycles # 3.040 GHz + 13,213,549,787 instructions # 1.78 insn per cycle + 2.495471586 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.078362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.258405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.258405e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.262532e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262532e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.440995 sec - 19,547,511,222 cycles # 3.033 GHz - 44,263,760,517 instructions # 2.26 insn per cycle - 6.447379338 seconds time elapsed +TOTAL : 6.417727 sec + 19,594,664,001 cycles # 3.052 GHz + 44,265,878,138 instructions # 2.26 insn per cycle + 6.424119903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.568240e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.019266e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.019266e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589377e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044221e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044221e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.623039 sec - 14,052,579,459 cycles # 3.037 GHz - 31,844,500,266 instructions # 2.27 insn per cycle - 4.629479950 seconds time elapsed +TOTAL : 4.559857 sec + 14,005,526,343 cycles # 3.068 GHz + 31,844,006,198 instructions # 2.27 insn per cycle + 4.566322148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.863308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.529884e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.529884e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.628189e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.628189e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.004138 sec - 11,314,763,691 cycles # 2.822 GHz - 20,739,815,252 instructions # 1.83 insn per cycle - 4.010963262 seconds time elapsed +TOTAL : 3.878054 sec + 11,287,723,645 cycles # 2.906 GHz + 20,738,072,181 instructions # 1.84 insn per cycle + 3.884538371 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.961498e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.695721e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.695721e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.014169e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.779352e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.779352e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.824549 sec
- 10,997,567,801 cycles # 2.871 GHz
- 20,355,988,697 instructions # 1.85 insn per cycle
- 3.831152322 seconds time elapsed
+TOTAL : 3.727856 sec
+ 11,041,223,612 cycles # 2.958 GHz
+ 20,355,670,345 instructions # 1.84 insn per cycle
+ 3.734291913 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.664769e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.161936e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.161936e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.744355e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.276403e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.276403e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.405341 sec
- 9,931,414,577 cycles # 2.252 GHz
- 16,884,401,146 instructions # 1.70 insn per cycle
- 4.411803387 seconds time elapsed
+TOTAL : 4.223001 sec
+ 9,961,082,180 cycles # 2.356 GHz
+ 16,884,642,255 instructions # 1.70 insn per cycle
+ 4.229415228 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
index 75e14339dc..06dd49c8ef 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:03:34
+DATE: 2023-11-09_18:21:50
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.826607e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.612761e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.962341e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.833760e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.622748e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.982780e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 1.301469 sec
- 4,673,993,383 cycles # 3.055 GHz
- 7,270,667,887 instructions # 1.56 insn per cycle
- 1.586588942 seconds time elapsed
+TOTAL : 1.311946 sec
+ 4,695,073,853 cycles # 3.035 GHz
+ 7,228,449,301 instructions # 1.54 insn per cycle
+ 1.606166442 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.143440e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.343019e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.343019e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.133856e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.330921e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.330921e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 6.249393 sec
- 19,374,513,863 cycles # 3.098 GHz
- 44,137,807,645 instructions # 2.28 insn per cycle
- 6.254447436 seconds time elapsed
+TOTAL : 6.295519 sec
+ 19,403,964,054 cycles # 3.081 GHz
+ 44,141,070,523 instructions # 2.27 insn per cycle
+ 6.300790833 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.651049e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.163460e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.163460e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.674176e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.191162e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.191162e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.566003 sec
- 13,842,407,454 cycles # 3.029 GHz
- 31,004,270,304 instructions # 2.24 insn per cycle
- 4.571383086 seconds time elapsed
+TOTAL : 4.504649 sec
+ 13,863,184,367 cycles # 3.075 GHz
+ 31,003,513,865 instructions # 2.24 insn per cycle
+ 4.509943224 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.085679e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.913536e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.913536e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.015608e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.805515e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.805515e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.759234 sec
- 11,164,737,043 cycles # 2.967 GHz
- 19,280,466,147 instructions # 1.73 insn per cycle
- 3.764531843 seconds time elapsed
+TOTAL : 3.880062 sec
+ 11,162,114,716 cycles # 2.882 GHz
+ 19,285,048,189 instructions # 1.73 insn per cycle
+ 3.885435669 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.157188e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.041275e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.041275e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.146900e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.045970e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.045970e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.667981 sec
- 10,833,619,022 cycles # 2.950 GHz
- 18,695,779,485 instructions # 1.73 insn per cycle
- 3.673091045 seconds time elapsed
+TOTAL : 3.683003 sec
+ 10,893,551,236 cycles # 2.955 GHz
+ 18,696,669,062 instructions # 1.72 insn per cycle
+ 3.688290519 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.852503e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.471081e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.471081e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.858668e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.475829e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.475829e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.150740 sec
- 9,740,231,931 cycles # 2.344 GHz
- 15,438,395,407 instructions # 1.59 insn per cycle
- 4.156220859 seconds time elapsed
+TOTAL : 4.138576 sec
+ 9,729,969,286 cycles # 2.349 GHz
+ 15,438,316,077 instructions # 1.59 insn per cycle
+ 4.143776269 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
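The 'Internal loops fptype_sv' lines compared in these hunks encode the SIMD width of the C++ event loop: SCALAR, VECTOR[2] (sse4), VECTOR[4] (avx2/512y) and VECTOR[8] (512z) for double precision. A minimal sketch of such a vector-of-events type, assuming GCC/clang vector extensions (the names fptype, fptype_v and neppV here are illustrative, not necessarily the cudacpp definitions):

// Sketch only: one SIMD "page" of events, one lane per event.
#include <cstddef>
typedef double fptype;                                    // 'DBL' builds use double precision
typedef fptype fptype_v __attribute__((vector_size(32))); // 4 doubles = 256 bits, i.e. VECTOR[4]
constexpr std::size_t neppV = sizeof(fptype_v) / sizeof(fptype); // lanes per vector
inline fptype_v square(const fptype_v& v) { return v * v; } // element-wise work on all lanes at once

Widening vector_size to 64 bytes would give the VECTOR[8] ('512z') layout measured above.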
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
index c2852b0755..148fb0d2ee 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:00:21
+DATE: 2023-11-09_18:18:36
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.830407e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.634363e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.010779e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.853961e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.658990e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.049126e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.985656 sec
- 3,531,228,063 cycles # 2.913 GHz
- 6,990,251,865 instructions # 1.98 insn per cycle
- 1.270939740 seconds time elapsed
+TOTAL : 0.956476 sec
+ 3,586,792,512 cycles # 3.034 GHz
+ 7,163,432,319 instructions # 2.00 insn per cycle
+ 1.241060065 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.143065e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.342569e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.342569e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.134189e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.330626e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.330626e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.897072 sec
- 18,280,833,177 cycles # 3.098 GHz
- 44,034,372,908 instructions # 2.41 insn per cycle
- 5.902241793 seconds time elapsed
+TOTAL : 5.945995 sec
+ 18,306,649,766 cycles # 3.077 GHz
+ 44,036,304,039 instructions # 2.41 insn per cycle
+ 5.951221281 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.647739e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.157991e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.157991e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.656363e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.166761e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.166761e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.221863 sec
- 12,803,042,604 cycles # 3.036 GHz
- 31,005,296,735 instructions # 2.42 insn per cycle
- 4.227230772 seconds time elapsed
+TOTAL : 4.200416 sec
+ 12,751,192,820 cycles # 3.033 GHz
+ 31,001,487,666 instructions # 2.43 insn per cycle
+ 4.205764852 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.083518e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.912332e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.912332e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.102659e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.940126e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.940126e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.412904 sec
- 10,065,358,042 cycles # 2.945 GHz
- 19,377,556,628 instructions # 1.93 insn per cycle
- 3.418078261 seconds time elapsed
+TOTAL : 3.381762 sec
+ 10,061,410,412 cycles # 2.972 GHz
+ 19,378,394,064 instructions # 1.93 insn per cycle
+ 3.387061232 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
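Every EvtsPerSec value diffed in these logs is a throughput, i.e. a processed-event count divided by a timer interval. A hedged sketch of such a counter (the 2048*256 event count mirrors the '-p 2048 256 1' arguments above; the rest is hypothetical, not the gcheck.exe/runTest.exe source):

#include <chrono>
#include <cstdio>
int main() {
  const int nevt = 2048 * 256; // grid x block, as in "-p 2048 256 1"
  const auto t0 = std::chrono::steady_clock::now();
  // ... compute matrix elements for nevt events here ...
  const auto t1 = std::chrono::steady_clock::now();
  const double sec = std::chrono::duration<double>(t1 - t0).count();
  std::printf("EvtsPerSec = ( %e ) sec^-1\n", nevt / sec);
  return 0;
}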
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.178157e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068476e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068476e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.165893e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.060121e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.060121e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.275616 sec
- 9,709,500,834 cycles # 2.960 GHz
- 18,994,586,612 instructions # 1.96 insn per cycle
- 3.280821668 seconds time elapsed
+TOTAL : 3.294663 sec
+ 9,710,957,285 cycles # 2.944 GHz
+ 18,994,988,980 instructions # 1.96 insn per cycle
+ 3.300038627 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.874008e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.497430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.497430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865019e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.483923e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.483923e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.750555 sec
- 8,607,389,256 cycles # 2.292 GHz
- 15,737,632,725 instructions # 1.83 insn per cycle
- 3.755880546 seconds time elapsed
+TOTAL : 3.767379 sec
+ 8,603,525,039 cycles # 2.281 GHz
+ 15,737,455,232 instructions # 1.83 insn per cycle
+ 3.772597879 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
index 6a5b6e889f..d2d2949097 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:57:07
+DATE: 2023-11-09_18:15:21
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.203248e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.569989e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.906875e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.240881e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.587683e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.915014e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.845079 sec
- 6,274,121,781 cycles # 3.027 GHz
- 11,554,949,617 instructions # 1.84 insn per cycle
- 2.129841068 seconds time elapsed
+TOTAL : 1.834661 sec
+ 6,293,478,609 cycles # 3.041 GHz
+ 11,504,742,224 instructions # 1.83 insn per cycle
+ 2.125902004 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.133729e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.330465e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.330465e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.133681e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.328323e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.328323e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.944694 sec
- 18,288,311,212 cycles # 3.074 GHz
- 44,034,741,687 instructions # 2.41 insn per cycle
- 5.950018785 seconds time elapsed
+TOTAL : 5.944399 sec
+ 18,276,841,424 cycles # 3.072 GHz
+ 44,034,753,944 instructions # 2.41 insn per cycle
+ 5.949724506 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.659128e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.174088e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.174088e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.688289e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.207763e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.207763e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.192977 sec
- 12,790,691,952 cycles # 3.048 GHz
- 31,002,731,251 instructions # 2.42 insn per cycle
- 4.198334883 seconds time elapsed
+TOTAL : 4.121368 sec
+ 12,748,827,025 cycles # 3.090 GHz
+ 31,001,833,202 instructions # 2.43 insn per cycle
+ 4.126844954 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.084534e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.927805e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.927805e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.079781e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.896967e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.896967e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.410971 sec
- 10,102,470,059 cycles # 2.959 GHz
- 19,378,571,736 instructions # 1.92 insn per cycle
- 3.416356813 seconds time elapsed
+TOTAL : 3.417656 sec
+ 10,039,679,603 cycles # 2.934 GHz
+ 19,377,458,106 instructions # 1.93 insn per cycle
+ 3.423002014 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1965) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.180416e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.077058e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.077058e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.191213e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.094392e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.094392e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.272531 sec
- 9,723,824,348 cycles # 2.967 GHz
- 19,005,371,454 instructions # 1.95 insn per cycle
- 3.277801420 seconds time elapsed
+TOTAL : 3.256603 sec
+ 9,688,244,134 cycles # 2.971 GHz
+ 19,005,599,231 instructions # 1.96 insn per cycle
+ 3.261875957 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1689) (512y: 181) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.875765e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.503453e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.503453e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.880720e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.508965e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.508965e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.745624 sec
- 8,623,946,797 cycles # 2.300 GHz
- 15,739,753,667 instructions # 1.83 insn per cycle
- 3.750856873 seconds time elapsed
+TOTAL : 3.737405 sec
+ 8,601,041,918 cycles # 2.299 GHz
+ 15,737,525,138 instructions # 1.83 insn per cycle
+ 3.742726567 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 900) (512y: 154) (512z: 1258)
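The derived columns in the counter lines can be cross-checked by hand: the GHz figure is cycles per second and 'insn per cycle' is instructions over cycles. A plain arithmetic check against the '+' rows of the 512z hunk just above (any small residual difference is expected if the profiler normalizes by CPU time rather than wall-clock time):

#include <cstdio>
int main() {
  const double cycles = 8601041918.0;  // "8,601,041,918 cycles # 2.299 GHz"
  const double insns = 15737525138.0;  // "15,737,525,138 instructions # 1.83 insn per cycle"
  const double elapsed = 3.742726567;  // "3.742726567 seconds time elapsed"
  std::printf("%.3f GHz\n", cycles / elapsed / 1e9); // ~2.298
  std::printf("%.2f insn per cycle\n", insns / cycles); // ~1.83
  return 0;
}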
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index 3b69c80285..2943a1e3d5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:15:46
+DATE: 2023-11-09_17:37:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.519106e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.841619e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.067099e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.636703e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.863019e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.046703e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.661524 sec
- 2,624,385,702 cycles # 2.945 GHz
- 4,009,504,923 instructions # 1.53 insn per cycle
- 0.953550123 seconds time elapsed
+TOTAL : 0.654694 sec
+ 2,666,558,745 cycles # 3.022 GHz
+ 4,096,338,325 instructions # 1.54 insn per cycle
+ 0.944612967 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.178868e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.397031e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.397031e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.202919e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.424199e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.424199e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.737674 sec
- 17,431,892,883 cycles # 3.036 GHz
- 41,881,565,184 instructions # 2.40 insn per cycle
- 5.743076445 seconds time elapsed
+TOTAL : 5.624164 sec
+ 17,409,154,909 cycles # 3.093 GHz
+ 41,881,099,052 instructions # 2.41 insn per cycle
+ 5.629252249 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.685142e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.222963e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.222963e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.734385e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.287483e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.287483e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.136316 sec
- 12,482,235,541 cycles # 3.016 GHz
- 30,165,183,766 instructions # 2.42 insn per cycle
- 4.141750487 seconds time elapsed
+TOTAL : 4.020839 sec
+ 12,439,753,645 cycles # 3.090 GHz
+ 30,163,334,779 instructions # 2.42 insn per cycle
+ 4.026082449 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.065221e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.894043e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.894043e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.071596e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.904428e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.904428e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.443708 sec
- 9,960,024,892 cycles # 2.889 GHz
- 19,109,707,129 instructions # 1.92 insn per cycle
- 3.449179794 seconds time elapsed
+TOTAL : 3.432943 sec
+ 9,954,541,311 cycles # 2.896 GHz
+ 19,109,473,980 instructions # 1.92 insn per cycle
+ 3.438069931 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1930) (512y: 0) (512z: 0)
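The recurring 'CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions' warnings indicate that these tests run with FP exception trapping turned on, which is what makes the '(NaN/abnormal=0, zero=0)' tallies meaningful. A sketch of how such an environment-variable opt-in could look (glibc-specific feenableexcept, visible by default under g++; the actual cudacpp hook may differ):

#include <cstdlib>
#include <fenv.h> // feenableexcept is a GNU extension
void enableFPEifRequested() {
  // Hypothetical illustration: trap the FP conditions that would produce NaN/inf.
  if (std::getenv("CUDACPP_RUNTIME_ENABLEFPE"))
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
}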
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.139235e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.013091e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.013091e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.172502e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.071351e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.071351e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.337798 sec
- 9,694,110,840 cycles # 2.900 GHz
- 18,764,903,742 instructions # 1.94 insn per cycle
- 3.343110507 seconds time elapsed
+TOTAL : 3.287111 sec
+ 9,635,946,931 cycles # 2.927 GHz
+ 18,764,577,329 instructions # 1.95 insn per cycle
+ 3.292294749 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 178) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.864706e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.496201e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.496201e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.921117e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.582437e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.582437e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.773287 sec
- 8,448,094,450 cycles # 2.236 GHz
- 15,614,366,385 instructions # 1.85 insn per cycle
- 3.778658466 seconds time elapsed
+TOTAL : 3.666524 sec
+ 8,448,044,488 cycles # 2.302 GHz
+ 15,613,692,408 instructions # 1.85 insn per cycle
+ 3.671704856 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1239)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index abd8e16103..e7918e9c23 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:39:41
+DATE: 2023-11-09_17:58:09
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.541150e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.656561e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.025623e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.801176e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.647831e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.027831e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.677402 sec
- 2,672,042,758 cycles # 2.933 GHz
- 4,104,960,698 instructions # 1.54 insn per cycle
- 0.969965661 seconds time elapsed
+TOTAL : 0.681809 sec
+ 2,713,657,783 cycles # 2.966 GHz
+ 4,201,847,315 instructions # 1.55 insn per cycle
+ 0.974362645 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.643045e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.106548e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.106548e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.699910e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.178375e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.178375e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.237683 sec
- 12,698,973,738 cycles # 2.997 GHz
- 32,580,365,424 instructions # 2.57 insn per cycle
- 4.243310096 seconds time elapsed
+TOTAL : 4.094284 sec
+ 12,664,884,276 cycles # 3.090 GHz
+ 32,577,115,805 instructions # 2.57 insn per cycle
+ 4.099557701 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.102523e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.004727e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.004727e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.143219e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.065278e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.065278e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.394812 sec
- 10,279,599,861 cycles # 3.024 GHz
- 24,505,440,482 instructions # 2.38 insn per cycle
- 3.400499086 seconds time elapsed
+TOTAL : 3.331773 sec
+ 10,271,423,521 cycles # 3.079 GHz
+ 24,506,625,447 instructions # 2.39 insn per cycle
+ 3.337328311 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.301834e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.372180e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.372180e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.319805e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.394403e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.394403e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.131325 sec
- 9,114,816,336 cycles # 2.906 GHz
- 16,941,253,973 instructions # 1.86 insn per cycle
- 3.136898880 seconds time elapsed
+TOTAL : 3.108988 sec
+ 9,122,185,757 cycles # 2.931 GHz
+ 16,942,074,182 instructions # 1.86 insn per cycle
+ 3.114300266 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.334227e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.444641e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.444641e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.263608e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.556489e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.556489e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.093526 sec
- 8,877,539,414 cycles # 2.866 GHz
- 16,358,190,505 instructions # 1.84 insn per cycle
- 3.099088246 seconds time elapsed
+TOTAL : 3.169374 sec
+ 9,426,858,565 cycles # 2.970 GHz
+ 16,370,203,044 instructions # 1.74 insn per cycle
+ 3.174743316 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.978126e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.726122e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.726122e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.105750e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.926413e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.926413e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.588578 sec
- 7,927,907,472 cycles # 2.207 GHz
- 14,594,253,089 instructions # 1.84 insn per cycle
- 3.594362581 seconds time elapsed
+TOTAL : 3.377253 sec
+ 7,897,254,276 cycles # 2.335 GHz
+ 14,592,693,571 instructions # 1.85 insn per cycle
+ 3.382567542 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955)
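The inl1 builds above ([inlineHel=1]) reach visibly higher throughputs than their inl0 counterparts (e.g. ~2.1e+06 vs ~1.3e+06 events/s for the no-SIMD MatrixElems rate), consistent with forced inlining of the helicity amplitude helpers letting the compiler optimize across call sites. A hedged sketch of the mechanism (the macro name and attribute usage are illustrative, not the actual cudacpp configuration):

// Sketch: an "inlineHel=1"-style build force-inlines the amplitude helpers.
#ifdef FORCE_INLINE_HELAMPS // hypothetical build flag
#define HELINLINE inline __attribute__((always_inline))
#else
#define HELINLINE inline
#endif
HELINLINE double helAmp(double p) { return p * p; } // stand-in for a vertex kernel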
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index d14dcc2cec..676eafadb1 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:40:11
+DATE: 2023-11-09_17:58:39
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.548142e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.673863e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.063444e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.818208e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.668713e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.053456e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.673757 sec
- 2,682,929,459 cycles # 2.958 GHz
- 4,116,085,529 instructions # 1.53 insn per cycle
- 0.967020710 seconds time elapsed
+TOTAL : 0.673433 sec
+ 2,679,233,058 cycles # 2.963 GHz
+ 4,187,218,910 instructions # 1.56 insn per cycle
+ 0.965878825 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.187961e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.086286e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.086286e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.244543e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.167572e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.167572e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.278046 sec
- 9,891,835,516 cycles # 3.013 GHz
- 25,457,241,379 instructions # 2.57 insn per cycle
- 3.283538395 seconds time elapsed
+TOTAL : 3.200260 sec
+ 9,840,700,159 cycles # 3.071 GHz
+ 25,456,933,061 instructions # 2.59 insn per cycle
+ 3.205821754 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.461475e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.800212e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.800212e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.515705e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.876135e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.876135e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.961448 sec
- 8,958,054,464 cycles # 3.020 GHz
- 21,514,605,384 instructions # 2.40 insn per cycle
- 2.967091806 seconds time elapsed
+TOTAL : 2.896836 sec
+ 8,925,793,988 cycles # 3.076 GHz
+ 21,514,573,078 instructions # 2.41 insn per cycle
+ 2.902177430 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.449114e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.718886e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.718886e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.506104e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.783990e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.783990e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.969121 sec
- 8,647,101,919 cycles # 2.908 GHz
- 15,830,093,651 instructions # 1.83 insn per cycle
- 2.974697377 seconds time elapsed
+TOTAL : 2.900756 sec
+ 8,606,887,419 cycles # 2.962 GHz
+ 15,829,788,154 instructions # 1.84 insn per cycle
+ 2.906279310 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.514280e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.825562e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.825562e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.541955e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.879613e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.879613e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.898480 sec
- 8,435,230,503 cycles # 2.906 GHz
- 15,528,950,884 instructions # 1.84 insn per cycle
- 2.904204103 seconds time elapsed
+TOTAL : 2.869140 sec
+ 8,396,471,591 cycles # 2.922 GHz
+ 15,529,030,850 instructions # 1.85 insn per cycle
+ 2.874505432 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.166244e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.072345e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.072345e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.119247e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.990719e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.990719e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.304157 sec
- 7,572,571,500 cycles # 2.289 GHz
- 14,293,792,931 instructions # 1.89 insn per cycle
- 3.309751939 seconds time elapsed
+TOTAL : 3.376497 sec
+ 7,569,554,118 cycles # 2.239 GHz
+ 14,295,014,243 instructions # 1.89 insn per cycle
+ 3.381953719 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index cfc01e370f..b0b6c7dbbf 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:16:19
+DATE: 2023-11-09_17:37:41
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.506984e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.290770e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.275463e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.535063e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.287307e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.259593e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.565965 sec
- 2,321,819,505 cycles # 2.946 GHz
- 3,610,558,250 instructions # 1.56 insn per cycle
- 0.846354753 seconds time elapsed
+TOTAL : 0.562225 sec
+ 2,332,457,444 cycles # 2.979 GHz
+ 3,625,755,159 instructions # 1.55 insn per cycle
+ 0.842176648 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.127208e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.335415e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.335415e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.164715e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.380430e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.380430e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 5.947738 sec
- 17,831,603,454 cycles # 2.997 GHz
- 43,615,812,813 instructions # 2.45 insn per cycle
- 5.952849241 seconds time elapsed
+TOTAL : 5.760009 sec
+ 17,802,097,031 cycles # 3.089 GHz
+ 43,613,527,077 instructions # 2.45 insn per cycle
+ 5.764750077 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.344868e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.581929e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.581929e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.392272e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.663586e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.663586e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.049781 sec
- 9,255,993,248 cycles # 3.030 GHz
- 21,926,767,970 instructions # 2.37 insn per cycle
- 3.055067484 seconds time elapsed
+TOTAL : 2.985891 sec
+ 9,233,559,019 cycles # 3.088 GHz
+ 21,925,837,880 instructions # 2.37 insn per cycle
+ 2.990875616 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.528612e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.886098e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.886098e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.561578e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.939602e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.939602e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.841538 sec
- 8,310,122,274 cycles # 2.920 GHz
- 15,590,852,784 instructions # 1.88 insn per cycle
- 2.846613446 seconds time elapsed
+TOTAL : 2.807792 sec
+ 8,302,482,665 cycles # 2.952 GHz
+ 15,590,734,796 instructions # 1.88 insn per cycle
+ 2.812825281 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.544975e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.933439e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.933439e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.577370e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.998184e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.998184e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.829740 sec
- 8,228,769,997 cycles # 2.904 GHz
- 15,439,791,314 instructions # 1.88 insn per cycle
- 2.834839900 seconds time elapsed
+TOTAL : 2.791624 sec
+ 8,243,582,435 cycles # 2.950 GHz
+ 15,435,159,534 instructions # 1.87 insn per cycle
+ 2.796691298 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.468064e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.774733e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.774733e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.534202e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.878199e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.878199e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.920266 sec
- 6,654,443,055 cycles # 2.276 GHz
- 12,870,591,658 instructions # 1.93 insn per cycle
- 2.925460933 seconds time elapsed
+TOTAL : 2.844579 sec
+ 6,638,595,923 cycles # 2.339 GHz
+ 12,873,058,969 instructions # 1.94 insn per cycle
+ 2.849721551 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439)
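In the FLT logs above each SIMD mode carries twice the lanes of the corresponding DBL build (VECTOR[4] sse4, VECTOR[8] avx2/512y, VECTOR[16] 512z), simply because a float is half the width of a double. A minimal sketch of that relation (illustrative typedefs, not the project's actual configuration header):

// With a fixed 512-bit register, the lane count doubles when fptype shrinks.
#ifdef USE_FLOAT // hypothetical switch for the 'FLT' builds
typedef float fptype;  // 16 lanes per 512-bit vector: VECTOR[16]
#else
typedef double fptype; // 8 lanes per 512-bit vector: VECTOR[8]
#endif
typedef fptype fptype_sv __attribute__((vector_size(64)));
static_assert(sizeof(fptype_sv) / sizeof(fptype) == (sizeof(fptype) == 4 ? 16 : 8), "lane count");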
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index b89c0950e0..198199e430 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_21:51:16
+DATE: 2023-11-09_18:09:35
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.262139e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.843159e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.843159e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.497702e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.965150e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.965150e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.672229 sec
- 5,680,712,756 cycles # 2.985 GHz
- 10,249,439,391 instructions # 1.80 insn per cycle
- 1.960159582 seconds time elapsed
+TOTAL : 1.636588 sec
+ 5,687,776,927 cycles # 3.043 GHz
+ 10,344,643,155 instructions # 1.82 insn per cycle
+ 1.926222709 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.122888e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.326457e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.326457e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.124698e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.329265e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.329265e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.070293 sec
- 18,467,877,178 cycles # 3.040 GHz
- 43,763,046,084 instructions # 2.37 insn per cycle
- 6.076144883 seconds time elapsed
+TOTAL : 6.061752 sec
+ 18,474,797,660 cycles # 3.045 GHz
+ 43,763,223,756 instructions # 2.37 insn per cycle
+ 6.067744277 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.241087e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.353707e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.353707e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.280805e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.418662e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.418662e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.295191 sec
- 10,020,961,358 cycles # 3.037 GHz
- 23,261,304,628 instructions # 2.32 insn per cycle
- 3.301360149 seconds time elapsed
+TOTAL : 3.239576 sec
+ 10,001,339,639 cycles # 3.083 GHz
+ 23,260,791,069 instructions # 2.33 insn per cycle
+ 3.245668541 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.364429e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.552712e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.552712e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.455472e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.697664e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.697664e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.146782 sec
- 9,058,696,000 cycles # 2.874 GHz
- 16,711,646,468 instructions # 1.84 insn per cycle
- 3.152847146 seconds time elapsed
+TOTAL : 3.034907 sec
+ 9,092,859,245 cycles # 2.991 GHz
+ 16,710,213,462 instructions # 1.84 insn per cycle
+ 3.041109346 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.299176e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.448559e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.448559e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.469626e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.746671e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.746671e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.242101 sec
- 8,995,544,368 cycles # 2.776 GHz
- 16,559,826,795 instructions # 1.84 insn per cycle
- 3.248399630 seconds time elapsed
+TOTAL : 3.026366 sec
+ 9,019,828,246 cycles # 2.976 GHz
+ 16,555,168,621 instructions # 1.84 insn per cycle
+ 3.032449491 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.425438e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.624655e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.624655e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.460766e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.686068e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.686068e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 3.082964 sec
- 7,440,102,740 cycles # 2.410 GHz
- 14,077,595,444 instructions # 1.89 insn per cycle
- 3.089018136 seconds time elapsed
+TOTAL : 3.037604 sec
+ 7,413,210,247 cycles # 2.436 GHz
+ 14,077,138,025 instructions # 1.90 insn per cycle
+ 3.043934055 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index a9a0d75eb2..38db2540d0 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2023-11-08_22:04:10
+DATE: 2023-11-09_18:22:25
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.383746e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.209904e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.237350e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.382431e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.208254e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.230961e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.160269 sec
- 4,203,267,927 cycles # 3.027 GHz
- 6,686,907,403 instructions # 1.59 insn per cycle
- 1.447760091 seconds time elapsed
+TOTAL : 1.150438 sec
+ 4,093,367,606 cycles # 2.986 GHz
+ 6,655,787,532 instructions # 1.63 insn per cycle
+ 1.427536965 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.159461e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.377089e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.377089e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.163257e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.379748e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.379748e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 6.100201 sec
- 18,832,208,439 cycles # 3.085 GHz
- 43,796,080,670 instructions # 2.33 insn per cycle
- 6.105246671 seconds time elapsed
+TOTAL : 6.085036 sec
+ 18,810,997,513 cycles # 3.089 GHz
+ 43,795,620,513 instructions # 2.33 insn per cycle
+ 6.090075734 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.360687e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.606052e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.606052e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.379076e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.642823e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.642823e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 3.340864 sec
- 10,252,717,994 cycles # 3.065 GHz
- 22,009,397,675 instructions # 2.15 insn per cycle
- 3.349625818 seconds time elapsed
+TOTAL : 3.315467 sec
+ 10,223,065,521 cycles # 3.080 GHz
+ 22,006,854,632 instructions # 2.15 insn per cycle
+ 3.320462987 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.544336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.928692e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.928692e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825644e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.145870 sec - 9,340,548,482 cycles # 2.966 GHz - 15,504,284,674 instructions # 1.66 insn per cycle - 3.151101472 seconds time elapsed +TOTAL : 3.212098 sec + 9,324,905,009 cycles # 2.900 GHz + 15,502,708,810 instructions # 1.66 insn per cycle + 3.217273015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.968460e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.968460e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.573485e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.002018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.002018e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.140902 sec - 9,274,295,743 cycles # 2.952 GHz - 15,151,601,553 instructions # 1.63 insn per cycle - 3.145942426 seconds time elapsed +TOTAL : 3.120613 sec + 9,288,549,778 cycles # 2.973 GHz + 15,149,849,415 instructions # 1.63 insn per cycle + 3.125542581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.615564e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.042980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.042980e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.617810e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.038860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.038860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.083778 sec - 7,670,760,165 cycles # 2.484 GHz - 12,580,664,280 instructions # 1.64 insn per cycle - 3.088953388 seconds time elapsed +TOTAL : 3.081671 sec + 7,641,480,002 cycles # 2.476 GHz + 12,579,693,620 instructions # 1.65 insn per cycle + 3.086750346 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
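
Note on the metrics above: the EvtsPerSec[Rmb+ME], EvtsPerSec[MatrixElems] and EvtsPerSec[MECalcOnly] figures are simply event counts divided by the wall-clock time of the corresponding workflow stage. A minimal sketch of how such a rate can be derived with std::chrono; the event count and the placeholder workload are illustrative, this is not the actual check.exe/gcheck.exe timer code:

  #include <chrono>
  #include <cstdio>
  int main() {
    constexpr int nevt = 524288; // e.g. gpublocks=2048 x gputhreads=256, as in the grids above
    const auto t0 = std::chrono::steady_clock::now();
    volatile double dummy = 0;
    for( int i = 0; i < nevt; i++ ) dummy = dummy + 1; // placeholder for the ME computation
    const auto t1 = std::chrono::steady_clock::now();
    const double secs = std::chrono::duration<double>( t1 - t0 ).count();
    std::printf( "EvtsPerSec = ( %e ) sec^-1\n", nevt / secs );
    return 0;
  }
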
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index e8e5add4c9..6fcc7aa480 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_22:00:55 +DATE: 2023-11-09_18:19:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217605e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.255851e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.390821e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223370e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.268045e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.834564 sec - 3,199,482,421 cycles # 3.039 GHz - 6,490,454,019 instructions # 2.03 insn per cycle - 1.111753408 seconds time elapsed +TOTAL : 0.831823 sec + 3,198,187,473 cycles # 3.040 GHz + 6,464,633,768 instructions # 2.02 insn per cycle + 1.108743988 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.150543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.366095e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.366095e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.383502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.833467 sec - 17,826,844,076 cycles # 3.054 GHz - 43,615,420,578 instructions # 2.45 insn per cycle - 5.838895279 seconds time elapsed +TOTAL : 5.750668 sec + 17,811,310,529 cycles # 3.095 GHz + 43,613,299,638 instructions # 2.45 insn per cycle + 5.755604942 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.337314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.571728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.571728e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.317079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552668e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.054886 sec - 9,243,837,324 cycles # 3.022 GHz - 21,925,827,754 instructions # 2.37 insn per cycle - 3.060063052 seconds time elapsed +TOTAL : 3.082399 sec + 9,236,711,908 cycles # 2.992 GHz + 21,926,264,881 instructions # 2.37 insn per cycle + 3.087937460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.568595e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.965452e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.965452e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.562942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.932578e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932578e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.797209 sec - 8,337,217,151 cycles # 2.976 GHz - 15,590,584,627 instructions # 1.87 insn per cycle - 2.802297250 seconds time elapsed +TOTAL : 2.803489 sec + 8,311,895,996 cycles # 2.960 GHz + 15,590,591,103 instructions # 1.88 insn per cycle + 2.808434072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) 
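
Note on the recurring banner: "CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" refers to trapping floating-point errors at run time instead of letting NaNs propagate silently. On glibc this is typically done with the feenableexcept GNU extension; a hedged sketch of how such an environment switch could be wired up (the exact trap mask used by the cudacpp runtime is not visible in these logs):

  #include <cstdlib> // std::getenv
  #include <fenv.h>  // feenableexcept (GNU extension; g++ defines _GNU_SOURCE by default)
  void enableFPEifRequested() {
    if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) ) // trap if the variable is set at all
      feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); // raise SIGFPE on these
  }
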
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.613887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.042160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.042160e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.582757e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993146e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.753850 sec - 8,236,246,865 cycles # 2.988 GHz - 15,440,580,051 instructions # 1.87 insn per cycle - 2.758988038 seconds time elapsed +TOTAL : 2.784903 sec + 8,236,233,463 cycles # 2.953 GHz + 15,439,539,485 instructions # 1.87 insn per cycle + 2.790025696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.649804e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.085948e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.085948e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.640868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066609e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066609e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.730536 sec - 6,628,841,045 cycles # 2.424 GHz - 12,869,136,387 instructions # 1.94 insn per cycle - 2.735524791 seconds time elapsed +TOTAL : 2.739279 sec + 6,618,156,482 cycles # 2.412 GHz + 12,869,303,752 instructions # 1.94 insn per cycle + 2.744541017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 4353a0323c..ef7d7310ec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:57:41 +DATE: 2023-11-09_18:15:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.439872e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182276e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.152259e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.457534e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184951e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.150897e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.433462 sec - 5,039,023,930 cycles # 3.052 GHz - 9,234,566,396 instructions # 1.83 insn per cycle - 1.710073871 seconds time elapsed +TOTAL : 1.431692 sec + 5,029,016,765 cycles # 3.047 GHz + 9,191,843,408 instructions # 1.83 insn per cycle + 1.708626202 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.165155e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.381805e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.381805e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.163854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380282e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.755055 sec - 17,830,794,091 cycles # 3.096 GHz - 43,613,836,777 instructions # 2.45 insn per cycle - 5.760227416 seconds time elapsed +TOTAL : 5.764377 sec + 17,805,909,761 cycles # 3.087 GHz + 43,613,494,568 instructions # 2.45 insn per cycle + 5.769597959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.340707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569922e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569922e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.391849e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652855e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.052308 sec - 9,235,069,524 cycles # 3.022 GHz - 21,925,950,370 instructions # 2.37 insn per cycle - 3.057391403 seconds time elapsed +TOTAL : 2.987897 sec + 9,257,292,453 cycles # 3.094 GHz + 21,926,827,781 instructions # 2.37 insn per cycle + 2.993012479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.565429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.942662e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.942662e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.568515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.950394e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950394e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.806664 sec - 8,327,245,678 cycles # 2.963 GHz - 15,591,035,358 instructions # 1.87 insn per cycle - 2.811768123 seconds time elapsed +TOTAL : 2.797984 sec + 8,317,461,722 cycles # 2.968 GHz + 15,591,357,650 instructions # 1.87 insn per cycle + 2.803063629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2595) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.574877e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971987e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.971987e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.510607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874545e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874545e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.795905 sec - 8,237,659,186 cycles # 2.942 GHz - 15,439,551,856 instructions # 1.87 insn per cycle - 2.800978610 seconds time elapsed +TOTAL : 2.865428 sec + 8,258,982,824 cycles # 2.878 GHz + 15,434,974,292 instructions # 1.87 insn per cycle + 2.870509731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.534626e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.883996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.883996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.749607 sec - 6,653,390,801 cycles # 2.416 GHz - 12,870,556,050 instructions # 1.93 insn per cycle - 2.754896991 seconds time elapsed +TOTAL : 2.847653 sec + 6,630,370,490 cycles # 2.325 GHz + 12,869,864,045 instructions # 1.94 insn per cycle + 2.852728913 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1735) (512y: 17) (512z: 1439) 
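
Note on the SIMD modes: the "Internal loops fptype_sv = VECTOR[n]" lines record the width of the compiler vector type used in the inner event loop, here 4 floats per SSE4.2 register, 8 per AVX2/512y register and 16 per 512z register (halved in double precision). A sketch in the spirit of those vector types, using the gcc/clang vector_size extension; the names neppV and fptype_v are illustrative:

  constexpr int neppV = 8; // events per vector, e.g. AVX2 in single precision
  typedef float fptype;
  typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) );
  fptype_v axpy( fptype a, fptype_v x, fptype_v y ) {
    return a * x + y; // one statement updates neppV events in lockstep
  }

The "=Symbols in CPPProcess.o=" counts quoted after each build are a disassembly-level cross-check that the intended instruction set was actually emitted.
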
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 4a8bf7a45a..acb88982d2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:16:49 +DATE: 2023-11-09_17:38:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.504004e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.299164e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.301394e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537187e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.294303e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293124e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.565713 sec - 2,319,019,949 cycles # 2.949 GHz - 3,628,185,594 instructions # 1.56 insn per cycle - 0.846311751 seconds time elapsed +TOTAL : 0.560273 sec + 2,360,194,998 cycles # 3.018 GHz + 3,675,767,532 instructions # 1.56 insn per cycle + 0.839402775 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.206183e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.450008e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.450008e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.245068e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494792e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494792e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.582273 sec - 16,756,629,307 cycles # 2.999 GHz - 41,373,009,702 instructions # 2.47 insn per cycle - 5.587382956 seconds time elapsed +TOTAL : 5.409364 sec + 16,727,058,520 cycles # 3.090 GHz + 41,371,618,921 instructions # 2.47 insn per cycle + 5.414214747 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.401015e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738811e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441577e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.817766e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817766e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.986422 sec - 9,012,092,925 cycles # 3.013 GHz - 21,229,937,185 instructions # 2.36 insn per cycle - 2.991621252 seconds time elapsed +TOTAL : 2.932681 sec + 9,069,604,999 cycles # 3.089 GHz + 21,230,786,011 instructions # 2.34 insn per cycle + 2.937680542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1841) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541320e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.913153e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.913153e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.599334e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.008101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.008101e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.831515 sec - 8,274,365,196 cycles # 2.917 GHz - 15,424,948,763 instructions # 1.86 insn per cycle - 2.836960602 seconds time elapsed +TOTAL : 2.767998 sec + 8,243,229,329 cycles # 2.973 GHz + 15,424,533,858 instructions # 1.87 insn per cycle + 2.772999466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2536) (512y: 0) (512z: 0) 
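
Note on the hrd0/hrd1 pairs: they differ only in hardcodePARAM, i.e. whether physics parameters are loaded at run time or baked in as compile-time constants, and the profiles above show the effect on GPU register pressure (launch__registers_per_thread 95 for hrd1 versus 117 for hrd0). An illustrative sketch of the two options; the macro and parameter names are placeholders, not the project's actual parameters class:

  #ifdef HARDCODE_PARAM
  // hrd1: constants the optimiser can fold directly into the kernels
  constexpr double mdl_MZ = 91.188;
  constexpr double mdl_WZ = 2.441404;
  #else
  // hrd0: values filled in once at run time, e.g. from param_card.dat
  extern double mdl_MZ;
  extern double mdl_WZ;
  #endif
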
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.599740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.051139e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.051139e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.643252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.114551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.773779 sec - 8,126,258,677 cycles # 2.925 GHz - 15,238,451,861 instructions # 1.88 insn per cycle - 2.778950300 seconds time elapsed +TOTAL : 2.727085 sec + 8,130,917,009 cycles # 2.977 GHz + 15,244,999,510 instructions # 1.87 insn per cycle + 2.732127705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.571238e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.958685e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.958685e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.930183e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.930183e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.804796 sec - 6,629,701,677 cycles # 2.360 GHz - 12,848,530,488 instructions # 1.94 insn per cycle - 2.809910943 seconds time elapsed +TOTAL : 2.826162 sec + 6,610,785,893 cycles # 2.336 GHz + 12,848,595,223 instructions # 1.94 insn per cycle + 2.831354272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1705) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index b8155a680e..1f616951f6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:40:40 +DATE: 2023-11-09_17:59:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.302615e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188065e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.274309e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.379623e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224230e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277206e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574549 sec - 2,352,849,250 cycles # 2.917 GHz - 3,649,350,219 instructions # 1.55 insn per cycle - 0.863978578 seconds time elapsed +TOTAL : 0.567867 sec + 2,380,227,304 cycles # 3.011 GHz + 3,716,615,660 instructions # 1.56 insn per cycle + 0.847985852 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.686060e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.194010e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.194010e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.702473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225484e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.097911 sec - 12,184,788,464 cycles # 2.970 GHz - 32,521,623,255 instructions # 2.67 insn per cycle - 4.103328943 seconds time elapsed +TOTAL : 4.053519 sec + 12,216,293,497 cycles # 3.011 GHz + 32,522,254,109 instructions # 2.66 insn per cycle + 4.058663851 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.770837e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689962e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.830691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.806288e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.806288e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.634890 sec - 7,998,179,733 cycles # 3.030 GHz - 18,690,180,922 instructions # 2.34 insn per cycle - 2.640235037 seconds time elapsed +TOTAL : 2.580563 sec + 7,975,462,428 cycles # 3.085 GHz + 18,690,132,924 instructions # 2.34 insn per cycle + 2.585721810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.861879e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.750654e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.750654e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.931453e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.867355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.867355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.559375 sec - 7,467,736,067 cycles # 2.913 GHz - 14,255,217,150 instructions # 1.91 insn per cycle - 2.564904201 seconds time elapsed +TOTAL : 2.497040 sec + 7,461,995,802 cycles # 2.983 GHz + 14,254,175,720 instructions # 1.91 insn per cycle + 2.502220546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.908800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.910304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.910304e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.990445e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.025789e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.025789e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.522982 sec - 7,364,286,769 cycles # 2.913 GHz - 13,952,625,236 instructions # 1.89 insn per cycle - 2.528348787 seconds time elapsed +TOTAL : 2.453022 sec + 7,312,763,088 cycles # 2.976 GHz + 13,952,233,674 instructions # 1.91 insn per cycle + 2.458314250 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.584257e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.006941e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.006941e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.649642e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.141006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.801165 sec - 6,529,127,011 cycles # 2.327 GHz - 13,421,836,325 instructions # 2.06 insn per cycle - 2.806446897 seconds time elapsed +TOTAL : 2.733236 sec + 6,541,090,853 cycles # 2.390 GHz + 13,422,969,862 instructions # 2.05 insn per cycle + 2.738380923 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) 
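
Note on the inl0/inl1 pairs: they toggle inlineHel, i.e. whether the helicity amplitude routines are force-inlined into the sigmaKin kernel; for this simple process the effect is large in the scalar build (MECalcOnly around 2.2e+06 sec^-1 above versus around 1.4e+06 for inl0). A generic sketch of such a switch; the macro name and the simplified FFV-style signature are illustrative:

  #ifdef INLINE_HELAMPS
  #define INLINE inline __attribute__( ( always_inline ) )
  #else
  #define INLINE // let the compiler decide
  #endif
  INLINE void FFV1_0( const float* F1, const float* F2, const float* V3, float* vertex ) {
    *vertex = F1[0] * F2[0] * V3[0]; // placeholder for the real amplitude algebra
  }
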
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 385ce72d78..374f2a331e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:41:07 +DATE: 2023-11-09_17:59:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.304320e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197410e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300141e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383788e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237025e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315197e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574067 sec - 2,385,415,994 cycles # 2.943 GHz - 3,655,710,101 instructions # 1.53 insn per cycle - 0.868231647 seconds time elapsed +TOTAL : 0.566447 sec + 2,356,919,781 cycles # 2.991 GHz + 3,683,739,571 instructions # 1.56 insn per cycle + 0.846741071 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.254695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.384461e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.384461e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.154781 sec - 9,423,263,848 cycles # 2.983 GHz - 25,307,020,372 instructions # 2.69 insn per cycle - 3.160042496 seconds time elapsed +TOTAL : 3.069124 sec + 9,404,467,335 cycles # 3.060 GHz + 25,307,412,416 instructions # 2.69 insn per cycle + 3.074433972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.134634e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.819272e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.819272e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.164094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.875777e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.875777e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.372030 sec - 7,183,608,233 cycles # 3.022 GHz - 16,901,599,192 instructions # 2.35 insn per cycle - 2.377377295 seconds time elapsed +TOTAL : 2.347070 sec + 7,183,873,212 cycles # 3.055 GHz + 16,901,716,244 instructions # 2.35 insn per cycle + 2.352401841 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1359) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.035295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.215553e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.215553e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.343298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.343298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.433519 sec - 7,141,153,744 cycles # 2.929 GHz - 13,619,130,373 instructions # 1.91 insn per cycle - 2.438958453 seconds time elapsed +TOTAL : 2.377491 sec + 7,114,519,285 cycles # 2.987 GHz + 13,619,081,744 instructions # 1.91 insn per cycle + 2.382536600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) 
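
Note on the counters: each timing block pairs raw hardware counts (cycles, instructions) with the derived rates printed after the # signs. The derivation is plain arithmetic; a sketch using the numbers from the avx2 block above:

  #include <cstdio>
  int main() {
    const double cycles = 7'114'519'285.0; // cycles counted over the run
    const double insns = 13'619'081'744.0; // instructions retired
    const double elapsed = 2.382536600;    // seconds time elapsed
    std::printf( "%.3f GHz\n", cycles / elapsed / 1e9 );    // ~2.99 GHz
    std::printf( "%.2f insn per cycle\n", insns / cycles ); // ~1.91
    return 0;
  }
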
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.071324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.326333e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.326333e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.131276e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.434861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.434861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.408738 sec - 7,063,825,257 cycles # 2.927 GHz - 13,435,596,499 instructions # 1.90 insn per cycle - 2.414135887 seconds time elapsed +TOTAL : 2.360462 sec + 7,057,553,337 cycles # 2.985 GHz + 13,435,682,624 instructions # 1.90 insn per cycle + 2.365710938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.750195e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.390595e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.390595e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.814153e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.521058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.521058e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.646969 sec - 6,340,373,316 cycles # 2.391 GHz - 13,154,077,274 instructions # 2.07 insn per cycle - 2.652485679 seconds time elapsed +TOTAL : 2.589000 sec + 6,345,330,255 cycles # 2.447 GHz + 13,153,121,215 instructions # 2.07 insn per cycle + 2.594408710 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index a176ffc4e4..8dc3126453 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:17:19 +DATE: 2023-11-09_17:38:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.486918e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.802792e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.976330e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.618205e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.831793e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977288e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.656484 sec - 2,625,682,009 cycles # 2.960 GHz - 4,099,364,380 instructions # 1.56 insn per cycle - 0.946865269 seconds time elapsed +TOTAL : 0.652821 sec + 2,648,283,165 cycles # 3.003 GHz + 4,101,874,172 instructions # 1.55 insn per cycle + 0.942277172 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091320e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.274850e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274850e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110302e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.167326 sec - 18,738,979,619 cycles # 3.037 GHz - 44,287,346,211 instructions # 2.36 insn per cycle - 6.172563885 seconds time elapsed +TOTAL : 6.061696 sec + 18,702,058,697 cycles # 3.083 GHz + 44,286,744,373 instructions # 2.37 insn per cycle + 6.066885580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.716365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.273883e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.273883e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.748205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315149e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315149e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.065766 sec - 12,369,623,289 cycles # 3.039 GHz - 30,960,892,415 instructions # 2.50 insn per cycle - 4.071137873 seconds time elapsed +TOTAL : 3.994898 sec + 12,345,141,895 cycles # 3.087 GHz + 30,960,600,041 instructions # 2.51 insn per cycle + 4.000031168 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.040246e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.832671e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.832671e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.024705e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.805066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.805066e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.479287 sec - 10,114,657,367 cycles # 2.903 GHz - 19,400,067,612 instructions # 1.92 insn per cycle - 3.484811762 seconds time elapsed +TOTAL : 3.505414 sec + 10,100,327,501 cycles # 2.878 GHz + 19,399,870,617 instructions # 1.92 insn per cycle + 3.510718654 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.136561e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.021650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.021650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.175175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.066367e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066367e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.335937 sec - 9,745,210,637 cycles # 2.917 GHz - 18,969,865,366 instructions # 1.95 insn per cycle - 3.341324685 seconds time elapsed +TOTAL : 3.279954 sec + 9,681,673,426 cycles # 2.948 GHz + 18,969,865,921 instructions # 1.96 insn per cycle + 3.285422855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1859) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.846714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.476604e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.476604e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.629123e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.810646 sec - 8,364,453,052 cycles # 2.192 GHz - 15,065,277,596 instructions # 1.80 insn per cycle - 3.816111336 seconds time elapsed +TOTAL : 3.620894 sec + 8,364,739,572 cycles # 2.308 GHz + 15,064,814,645 instructions # 1.80 insn per cycle + 3.626218437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 155) (512z: 1316) 
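
Note on the m_* logs ("FP precision = MIXED"): they combine the two floating-point widths, running part of the computation in float while keeping double where rounding accumulates. A generic sketch of that pattern, not necessarily the project's exact fptype/fptype2 split:

  #include <vector>
  // Per-event weights computed in float, mean accumulated in double
  double meanME( const std::vector<float>& me ) {
    double sum = 0.0; // double accumulator limits round-off over many events
    for( const float w : me ) sum += w;
    return me.empty() ? 0.0 : sum / static_cast<double>( me.size() );
  }
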
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 257a2b14eb..a2d87f5da8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-08_21:17:52 +DATE: 2023-11-09_17:39:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.517340e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835074e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.047913e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632265e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.861047e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.036900e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.657459 sec - 2,634,612,924 cycles # 2.971 GHz - 4,038,430,114 instructions # 1.53 insn per cycle - 0.947276631 seconds time elapsed +TOTAL : 0.649714 sec + 2,641,937,888 cycles # 3.008 GHz + 4,107,555,428 instructions # 1.55 insn per cycle + 0.938941535 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.135032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.337803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.158637e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.370951e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.370951e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.948882 sec - 17,974,083,702 cycles # 3.020 GHz - 42,538,758,836 instructions # 2.37 insn per cycle - 5.954247483 seconds time elapsed +TOTAL : 5.823305 sec + 18,013,373,486 cycles # 3.091 GHz + 42,535,982,962 instructions # 2.36 insn per cycle + 5.828417378 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.746148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320939e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320939e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.770599e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353490e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.005000 sec - 12,179,888,264 cycles # 3.038 GHz - 30,267,022,025 instructions # 2.48 insn per cycle - 4.010444441 seconds time elapsed +TOTAL : 3.950045 sec + 12,171,205,402 cycles # 3.078 GHz + 30,268,628,414 instructions # 2.49 insn per cycle + 3.955313835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.877404e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.877404e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.099406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.925166e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.925166e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440250 sec - 10,026,177,275 cycles # 2.911 GHz - 19,281,771,933 instructions # 1.92 insn per cycle - 3.445652030 seconds time elapsed +TOTAL : 3.385614 sec + 10,033,748,773 cycles # 2.960 GHz + 19,281,534,051 instructions # 1.92 insn per cycle + 3.390768328 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2162) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.165158e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.064737e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.064737e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.135260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020042e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020042e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.297369 sec - 9,639,905,003 cycles # 2.920 GHz - 18,781,958,033 instructions # 1.95 insn per cycle - 3.302769757 seconds time elapsed +TOTAL : 3.343797 sec + 9,615,342,352 cycles # 2.872 GHz + 18,771,093,665 instructions # 1.95 insn per cycle + 3.349067283 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1833) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925761e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.602996e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.602996e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965653e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666391e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.664817 sec - 8,281,446,223 cycles # 2.257 GHz - 14,988,620,827 instructions # 1.81 insn per cycle - 3.670422107 seconds time elapsed +TOTAL : 3.592114 sec + 8,278,170,966 cycles # 2.302 GHz + 14,988,534,751 instructions # 1.81 insn per cycle + 3.597402233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1020) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 06ab23436d..dad81481e1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:18:25 +DATE: 2023-11-09_17:39:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.051243e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.169781e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269231e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113101e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178068e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274620e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.513968 sec - 2,206,571,631 cycles # 2.965 GHz - 3,147,975,302 instructions # 1.43 insn per cycle - 0.801145911 seconds time elapsed +TOTAL : 0.513513 sec + 2,238,779,994 cycles # 3.016 GHz + 3,236,054,047 instructions # 1.45 insn per cycle + 0.800586540 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.149781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.212668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199296e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.263095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.263095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.981998 sec - 15,156,593,836 cycles # 3.040 GHz - 38,437,072,823 instructions # 2.54 insn per cycle - 4.987299145 seconds time elapsed +TOTAL : 4.870986 sec + 15,138,095,755 cycles # 3.105 GHz + 38,436,824,615 instructions # 2.54 insn per cycle + 4.876178872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.640780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.985566 sec - 9,095,215,674 cycles # 3.042 GHz - 24,591,174,592 instructions # 2.70 insn per cycle - 2.991001875 seconds time elapsed +TOTAL : 2.960626 sec + 9,095,550,717 cycles # 3.068 GHz + 24,591,504,229 instructions # 2.70 insn per cycle + 2.966139239 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.834785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.339543e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.339543e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.803896e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.327557e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.327557e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.901490 sec - 5,454,837,265 cycles # 2.862 GHz - 11,265,546,477 instructions # 2.07 insn per cycle - 1.907039068 seconds time elapsed +TOTAL : 1.909794 sec + 5,486,817,505 cycles # 2.866 GHz + 11,265,648,347 instructions # 2.05 insn per cycle + 1.915029323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.372557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993390e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993390e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.555272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.195980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.195980e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.751887 sec - 4,963,717,675 cycles # 2.826 GHz - 10,572,023,161 instructions # 2.13 insn per cycle - 1.757527600 seconds time elapsed +TOTAL : 1.704245 sec + 4,927,847,485 cycles # 2.884 GHz + 10,572,013,859 instructions # 2.15 insn per cycle + 1.709455619 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.939400e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.168716e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.168716e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.103362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.341522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.341522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.769882 sec - 5,377,512,872 cycles # 1.939 GHz - 7,806,286,911 instructions # 1.45 insn per cycle - 2.775553290 seconds time elapsed +TOTAL : 2.658432 sec + 5,379,828,238 cycles # 2.021 GHz + 7,805,118,346 instructions # 1.45 insn per cycle + 2.663615123 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8de158cb65..d089f3ea80 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:51:49 +DATE: 2023-11-09_18:10:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.592700e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008872e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008872e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.436618e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.989585e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989585e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.804684 sec - 3,099,147,756 cycles # 2.967 GHz - 4,823,816,385 instructions # 1.56 insn per cycle - 1.102344703 seconds time elapsed +TOTAL : 0.820320 sec + 3,087,525,024 cycles # 2.881 GHz + 4,797,416,225 instructions # 1.55 insn per cycle + 1.129126082 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.051112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.111963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.111963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.137936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.202451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202451e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.297825 sec - 15,481,852,434 cycles # 2.919 GHz - 38,496,050,546 instructions # 2.49 insn per cycle - 5.304382607 seconds time elapsed +TOTAL : 5.088780 sec + 15,506,176,025 cycles # 3.045 GHz + 38,500,320,484 instructions # 2.48 insn per cycle + 5.095207532 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.421539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.610351e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.610351e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.664205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.863051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.863051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.252273 sec - 9,439,657,096 cycles # 2.897 GHz - 24,775,783,847 instructions # 2.62 insn per cycle - 3.259008663 seconds time elapsed +TOTAL : 3.042056 sec + 9,436,538,509 cycles # 3.096 GHz + 24,774,730,249 instructions # 2.63 insn per cycle + 3.048601444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.465972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.935898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.935898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.821161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.311886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.311886e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.107608 sec - 5,817,196,530 cycles # 2.752 GHz - 11,552,661,145 instructions # 1.99 insn per cycle - 2.114326410 seconds time elapsed +TOTAL : 1.984151 sec + 5,841,767,961 cycles # 2.936 GHz + 11,552,228,699 instructions # 1.98 insn per cycle + 1.990639911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.009635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.580924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.580924e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.505257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.122209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.122209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.934696 sec - 5,303,416,333 cycles # 2.735 GHz - 10,861,487,391 instructions # 2.05 insn per cycle - 1.941424882 seconds time elapsed +TOTAL : 1.793114 sec + 5,293,839,115 cycles # 2.943 GHz + 10,856,913,242 instructions # 2.05 insn per cycle + 1.799607546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.701730e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912869e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912869e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.021313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.250852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.250852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.025583 sec - 5,727,782,590 cycles # 1.894 GHz - 8,052,158,492 instructions # 1.41 insn per cycle - 3.032424174 seconds time elapsed +TOTAL : 2.791071 sec + 5,762,529,693 cycles # 2.060 GHz + 8,048,857,986 instructions # 1.40 insn per cycle + 2.797719094 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index fc433be1ef..d4092f872a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:04:42 +DATE: 2023-11-09_18:22:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.726172e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159376e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270269e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.736311e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160845e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271332e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.626000 sec - 2,413,951,090 cycles # 2.822 GHz - 3,508,959,445 instructions # 1.45 insn per cycle - 0.913280230 seconds time elapsed +TOTAL : 0.616077 sec + 2,487,675,163 cycles # 2.949 GHz + 3,609,155,412 instructions # 1.45 insn per cycle + 0.900867999 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247369e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247369e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.176864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.240941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240941e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.967265 sec - 15,332,653,861 cycles # 3.084 GHz - 38,452,810,595 instructions # 2.51 insn per cycle - 4.972510854 seconds time elapsed +TOTAL : 4.979715 sec + 15,323,819,271 cycles # 3.075 GHz + 38,452,992,607 instructions # 2.51 insn per cycle + 4.984901972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.898409e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.898409e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.677729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.878488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.878488e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.999548 sec - 9,281,583,975 cycles # 3.090 GHz - 24,591,762,393 instructions # 2.65 insn per cycle - 3.004985897 seconds time elapsed +TOTAL : 3.013873 sec + 9,290,869,776 cycles # 3.079 GHz + 24,592,367,735 instructions # 2.65 insn per cycle + 3.019043179 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.871319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.385365e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385365e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.850559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370312e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.950157 sec - 5,690,984,261 cycles # 2.911 GHz - 11,247,762,981 instructions # 1.98 insn per cycle - 1.955461495 seconds time elapsed +TOTAL : 1.954495 sec + 5,685,208,050 cycles # 2.902 GHz + 11,247,975,749 instructions # 1.98 insn per cycle + 1.959795584 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.503413e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.137413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.137413e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.607127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248201e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.776614 sec - 5,148,876,403 cycles # 2.891 GHz - 10,521,901,939 instructions # 2.04 insn per cycle - 1.781976606 seconds time elapsed +TOTAL : 1.748348 sec + 5,124,696,849 cycles # 2.923 GHz + 10,520,869,381 instructions # 2.05 insn per cycle + 1.753705732 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.075607e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.312212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.312212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.874391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.086224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.086224e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.736817 sec - 5,563,466,882 cycles # 2.030 GHz - 7,754,129,949 instructions # 1.39 insn per cycle - 2.742022793 seconds time elapsed +TOTAL : 2.872375 sec + 5,588,777,867 cycles # 1.950 GHz + 7,758,258,898 instructions # 1.39 insn per cycle + 2.877703247 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index f949e08a8e..b9b046957a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:01:25 +DATE: 2023-11-09_18:19:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.746837e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161251e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269946e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.737213e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157401e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270983e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.546588 sec - 2,339,106,527 cycles # 3.024 GHz - 3,639,530,742 instructions # 1.56 insn per cycle - 0.830477401 seconds time elapsed +TOTAL : 0.551870 sec + 2,343,082,954 cycles # 3.005 GHz + 3,662,705,915 instructions # 1.56 insn per cycle + 0.837059271 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.194821e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.259419e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.259419e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.189334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253549e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.881299 sec - 15,162,215,504 cycles # 3.104 GHz - 38,436,564,546 instructions # 2.54 insn per cycle - 4.886593937 seconds time elapsed +TOTAL : 4.893645 sec + 15,145,823,463 cycles # 3.092 GHz + 38,436,891,323 instructions # 2.54 insn per cycle + 4.899128465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.717533e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.921164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.921164e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.701689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.903671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.924290 sec - 9,098,563,572 cycles # 3.107 GHz - 24,592,229,111 instructions # 2.70 insn per cycle - 2.929612410 seconds time elapsed +TOTAL : 2.937292 sec + 9,090,406,845 cycles # 3.091 GHz + 24,590,949,325 instructions # 2.71 insn per cycle + 2.942627315 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.896966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.423160e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.423160e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.932093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.448459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.448459e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.883509 sec - 5,473,701,924 cycles # 2.899 GHz - 11,265,098,305 instructions # 2.06 insn per cycle - 1.888826353 seconds time elapsed +TOTAL : 1.870782 sec + 5,477,596,736 cycles # 2.921 GHz + 11,265,174,730 instructions # 2.06 insn per cycle + 1.876089705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.333944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.936194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.936194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.470328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.111006e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759678 sec - 4,959,739,230 cycles # 2.811 GHz - 10,570,009,461 instructions # 2.13 insn per cycle - 1.765083600 seconds time elapsed +TOTAL : 1.725707 sec + 4,951,306,612 cycles # 2.866 GHz + 10,571,555,034 instructions # 2.14 insn per cycle + 1.731137280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.108089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.344532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.344532e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.162238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.655128 sec - 5,388,561,520 cycles # 2.026 GHz - 7,804,959,196 instructions # 1.45 insn per cycle - 2.660471194 seconds time elapsed +TOTAL : 2.762921 sec + 5,392,276,499 cycles # 1.949 GHz + 7,806,030,768 instructions # 1.45 insn per cycle + 2.768347372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 6c72f6887e..655f8b81f2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:58:12 +DATE: 2023-11-09_18:16:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.993868e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158186e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266776e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.038584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158740e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268341e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.697399 sec - 2,787,983,765 cycles # 3.019 GHz - 4,369,945,413 instructions # 1.57 insn per cycle - 0.982292174 seconds time elapsed +TOTAL : 0.696596 sec + 2,778,143,738 cycles # 3.016 GHz + 4,350,451,856 instructions # 1.57 insn per cycle + 0.980250782 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.151791e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.213814e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.213814e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.182446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.245839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.245839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.977336 sec - 15,184,395,969 cycles # 3.048 GHz - 38,438,963,256 instructions # 2.53 insn per cycle - 4.982648512 seconds time elapsed +TOTAL : 4.909547 sec + 15,150,996,904 cycles # 3.083 GHz + 38,436,637,567 instructions # 2.54 insn per cycle + 4.914838193 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.705404e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.908313e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.908313e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.688279e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.888858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888858e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.933004 sec - 9,125,855,621 cycles # 3.107 GHz - 24,590,801,711 instructions # 2.69 insn per cycle - 2.938291037 seconds time elapsed +TOTAL : 2.947447 sec + 9,111,190,675 cycles # 3.087 GHz + 24,590,939,294 instructions # 2.70 insn per cycle + 2.952793630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2156) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.720849e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.210353e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.210353e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.931624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.461725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.461725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.938623 sec - 5,466,827,554 cycles # 2.814 GHz - 11,265,438,862 instructions # 2.06 insn per cycle - 1.943823759 seconds time elapsed +TOTAL : 1.871366 sec + 5,440,450,573 cycles # 2.900 GHz + 11,265,206,629 instructions # 2.07 insn per cycle + 1.876659163 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.635980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.287954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.287954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268733e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.682323 sec - 4,955,566,146 cycles # 2.937 GHz - 10,571,524,775 instructions # 2.13 insn per cycle - 1.687724736 seconds time elapsed +TOTAL : 1.687174 sec + 4,939,929,910 cycles # 2.920 GHz + 10,570,291,125 instructions # 2.14 insn per cycle + 1.692619999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.091835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.326386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.326386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.058474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.295667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.295667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.665788 sec - 5,400,449,096 cycles # 2.023 GHz - 7,805,014,579 instructions # 1.45 insn per cycle - 2.671129758 seconds time elapsed +TOTAL : 2.687752 sec + 5,409,737,421 cycles # 2.010 GHz + 7,805,529,138 instructions # 1.44 insn per cycle + 2.693129228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 3a0f520dcc..e703e9e5d5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:18:52 +DATE: 2023-11-09_17:40:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.048585e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168286e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265645e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110180e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174406e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270579e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.515938 sec - 2,194,564,244 cycles # 2.948 GHz - 3,170,767,882 instructions # 1.44 insn per cycle - 0.803319972 seconds time elapsed +TOTAL : 0.513301 sec + 2,237,705,656 cycles # 3.016 GHz + 3,206,861,926 instructions # 1.43 insn per cycle + 0.799816578 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.145803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.208726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.208726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.213184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.991935 sec - 15,019,527,641 cycles # 3.006 GHz - 40,165,389,576 instructions # 2.67 insn per cycle - 4.997467241 seconds time elapsed +TOTAL : 4.840849 sec + 15,026,294,462 cycles # 3.101 GHz + 40,163,846,165 instructions # 2.67 insn per cycle + 4.846092672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.795270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.015877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.015877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.848578e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.068499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.068499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.867596 sec - 8,671,075,725 cycles # 3.019 GHz - 23,683,669,849 instructions # 2.73 insn per cycle - 2.873212548 seconds time elapsed +TOTAL : 2.827567 sec + 8,771,607,406 cycles # 3.097 GHz + 23,683,918,687 instructions # 2.70 insn per cycle + 2.832818835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2069) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.180539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.583447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.583447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.290749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696907e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.128793 sec - 6,072,650,571 cycles # 2.846 GHz - 13,074,915,373 instructions # 2.15 insn per cycle - 2.134316674 seconds time elapsed +TOTAL : 2.084491 sec + 6,075,216,707 cycles # 2.908 GHz + 13,074,699,153 instructions # 2.15 insn per cycle + 2.089762357 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.449593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.890564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.890564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.571274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.025491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.025491e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.028925 sec - 5,794,294,617 cycles # 2.851 GHz - 12,335,132,296 instructions # 2.13 insn per cycle - 2.034385767 seconds time elapsed +TOTAL : 1.983621 sec + 5,795,280,725 cycles # 2.915 GHz + 12,334,890,295 instructions # 2.13 insn per cycle + 1.988789955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 294) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.645486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.899846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.899846e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.982084 sec - 5,814,493,383 cycles # 1.947 GHz - 9,613,724,456 instructions # 1.65 insn per cycle - 2.987600867 seconds time elapsed +TOTAL : 2.932528 sec + 5,816,798,800 cycles # 1.981 GHz + 9,613,398,484 instructions # 1.65 insn per cycle + 2.938057800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1510) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 1cbf67a236..a5c5a0c704 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:41:33 +DATE: 2023-11-09_17:59:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.595048e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160670e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269203e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.735374e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165776e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275136e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521954 sec - 2,216,810,301 cycles # 2.935 GHz - 3,140,499,783 instructions # 1.42 insn per cycle - 0.812101303 seconds time elapsed +TOTAL : 0.522181 sec + 2,183,845,501 cycles # 2.897 GHz + 3,063,497,760 instructions # 1.40 insn per cycle + 0.813008083 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.505174e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.591402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.591402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573222e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.291146 sec - 13,017,199,090 cycles # 3.030 GHz - 34,406,598,887 instructions # 2.64 insn per cycle - 4.296733375 seconds time elapsed +TOTAL : 4.321242 sec + 13,015,032,492 cycles # 3.009 GHz + 34,406,787,342 instructions # 2.64 insn per cycle + 4.326519493 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.106755e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.121956e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266333e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.481603 sec - 10,608,834,284 cycles # 3.044 GHz - 24,023,421,035 instructions # 2.26 insn per cycle - 3.487384559 seconds time elapsed +TOTAL : 3.465165 sec + 10,606,115,107 cycles # 3.057 GHz + 24,023,886,202 instructions # 2.27 insn per cycle + 3.470527002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.756679e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.089717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.813993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.151107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.151107e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.309669 sec - 6,605,241,660 cycles # 2.854 GHz - 12,414,642,119 instructions # 1.88 insn per cycle - 2.315374830 seconds time elapsed +TOTAL : 2.282824 sec + 6,624,207,523 cycles # 2.896 GHz + 12,414,593,585 instructions # 1.87 insn per cycle + 2.288220203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.883072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.243446e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.243446e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.113256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.253913 sec - 6,256,146,881 cycles # 2.770 GHz - 11,588,754,266 instructions # 1.85 insn per cycle - 2.259602028 seconds time elapsed +TOTAL : 2.154567 sec + 6,244,302,737 cycles # 2.892 GHz + 11,586,784,905 instructions # 1.86 insn per cycle + 2.160119888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.014282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.246391e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.246391e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.080168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.315597e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.315597e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.718420 sec - 5,340,176,505 cycles # 1.961 GHz - 9,309,276,244 instructions # 1.74 insn per cycle - 2.724177871 seconds time elapsed +TOTAL : 2.674256 sec + 5,337,021,373 cycles # 1.992 GHz + 9,309,292,596 instructions # 1.74 insn per cycle + 2.679621915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 086ff92179..04c22c3970 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:42:00 +DATE: 2023-11-09_18:00:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.601958e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157408e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268312e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.730812e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271522e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523179 sec - 2,197,044,574 cycles # 2.904 GHz - 3,180,010,549 instructions # 1.45 insn per cycle - 0.813333970 seconds time elapsed +TOTAL : 0.517390 sec + 2,237,231,843 cycles # 2.985 GHz + 3,219,482,821 instructions # 1.44 insn per cycle + 0.806478536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.551621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.643503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.686328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.783817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.783817e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.216286 sec - 12,375,189,012 cycles # 2.932 GHz - 35,060,083,206 instructions # 2.83 insn per cycle - 4.222169031 seconds time elapsed +TOTAL : 4.006446 sec + 12,372,456,833 cycles # 3.085 GHz + 35,059,205,099 instructions # 2.83 insn per cycle + 4.011874603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.067813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209694e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209694e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.113185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.525507 sec - 10,698,056,208 cycles # 3.031 GHz - 23,100,081,560 instructions # 2.16 insn per cycle - 3.531306963 seconds time elapsed +TOTAL : 3.471336 sec + 10,684,507,667 cycles # 3.074 GHz + 23,099,965,959 instructions # 2.16 insn per cycle + 3.476724591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.118146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.507530e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.507530e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.172732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.154521 sec - 6,166,402,806 cycles # 2.856 GHz - 11,969,983,926 instructions # 1.94 insn per cycle - 2.160177772 seconds time elapsed +TOTAL : 2.130496 sec + 6,169,121,187 cycles # 2.891 GHz + 11,970,628,399 instructions # 1.94 insn per cycle + 2.136000238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.238236e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.649069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.649069e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.728928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.728928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.108281 sec - 6,026,300,401 cycles # 2.854 GHz - 11,141,738,024 instructions # 1.85 insn per cycle - 2.114031870 seconds time elapsed +TOTAL : 2.076859 sec + 6,006,071,025 cycles # 2.885 GHz + 11,143,550,799 instructions # 1.86 insn per cycle + 2.082481137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.978977e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.208595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.208595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.186490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.434908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.434908e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.742076 sec - 5,240,960,370 cycles # 1.908 GHz - 9,033,887,762 instructions # 1.72 insn per cycle - 2.747795404 seconds time elapsed +TOTAL : 2.608202 sec + 5,201,388,823 cycles # 1.991 GHz + 9,034,449,537 instructions # 1.74 insn per cycle + 2.613510222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eb4d5419ee..b055a915bb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:19:21 +DATE: 2023-11-09_17:40:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037656e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679710e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950060e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058988e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.701786e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976764e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.474624 sec - 2,093,800,407 cycles # 2.948 GHz - 2,971,543,250 instructions # 1.42 insn per cycle - 0.767958808 seconds time elapsed +TOTAL : 0.470897 sec + 2,078,401,117 cycles # 3.001 GHz + 2,953,650,991 instructions # 1.42 insn per cycle + 0.749721776 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.294584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.370694e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.370694e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.410542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.654140 sec - 14,153,083,054 cycles # 3.038 GHz - 38,392,852,878 instructions # 2.71 insn per cycle - 4.659227784 seconds time elapsed +TOTAL : 4.574332 sec + 14,151,959,917 cycles # 3.091 GHz + 38,392,913,322 instructions # 2.71 insn per cycle + 4.579307325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.142013e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.564188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.564188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.213719e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.123842 sec - 6,471,678,330 cycles # 3.041 GHz - 15,829,749,383 instructions # 2.45 insn per cycle - 2.129132115 seconds time elapsed +TOTAL : 2.094684 sec + 6,471,158,629 cycles # 3.083 GHz + 15,829,971,957 instructions # 2.45 insn per cycle + 2.099849038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.403745e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.082517e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082517e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.559598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.101002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101002e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.198427 sec - 3,459,269,129 cycles # 2.876 GHz - 7,606,844,485 instructions # 2.20 insn per cycle - 1.203597878 seconds time elapsed +TOTAL : 1.179944 sec + 3,466,899,201 cycles # 2.927 GHz + 7,607,183,710 instructions # 2.19 insn per cycle + 1.185084453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.005658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168806e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168806e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023293e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.190211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190211e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.126360 sec - 3,254,355,778 cycles # 2.878 GHz - 7,215,715,994 instructions # 2.22 insn per cycle - 1.131662200 seconds time elapsed +TOTAL : 1.106259 sec + 3,248,324,558 cycles # 2.924 GHz + 7,215,751,749 instructions # 2.22 insn per cycle + 1.111467205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.276060e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.142577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.142577e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.528725 sec - 3,068,447,705 cycles # 2.001 GHz - 5,846,027,778 instructions # 1.91 insn per cycle - 1.534029615 seconds time elapsed +TOTAL : 1.512902 sec + 3,068,145,100 cycles # 2.024 GHz + 5,846,808,445 instructions # 1.91 insn per cycle + 1.518114660 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 459315b5db..b4b4f0117a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:52:18 +DATE: 2023-11-09_18:10:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.229057e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759945e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759945e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.332495e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.768677e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.768677e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.663839 sec - 2,633,797,388 cycles # 2.963 GHz - 4,071,573,226 instructions # 1.55 insn per cycle - 0.947283739 seconds time elapsed +TOTAL : 0.657541 sec + 2,664,976,053 cycles # 3.017 GHz + 4,137,029,639 instructions # 1.55 insn per cycle + 0.940709573 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.280486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.353996e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.353996e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.284632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359311e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359311e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.724775 sec - 14,342,143,211 cycles # 3.033 GHz - 38,438,250,053 instructions # 2.68 insn per cycle - 4.731136861 seconds time elapsed +TOTAL : 4.717410 sec + 14,339,509,352 cycles # 3.036 GHz + 38,436,261,270 instructions # 2.68 insn per cycle + 4.723588153 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.072115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.484269e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.484269e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.161401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579571e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.579571e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.197377 sec - 6,673,460,854 cycles # 3.029 GHz - 16,110,044,412 instructions # 2.41 insn per cycle - 2.203637127 seconds time elapsed +TOTAL : 2.160392 sec + 6,674,034,151 cycles # 3.082 GHz + 16,110,239,223 instructions # 2.41 insn per cycle + 2.166483007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.156025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050843e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050843e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.368587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075649e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.276703 sec - 3,679,224,682 cycles # 2.872 GHz - 7,844,733,298 instructions # 2.13 insn per cycle - 1.282950304 seconds time elapsed +TOTAL : 1.245937 sec + 3,665,898,836 cycles # 2.929 GHz + 7,844,268,726 instructions # 2.14 insn per cycle + 1.252070096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.848037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.141843e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.141843e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007320e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169448e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.194194 sec - 3,452,479,238 cycles # 2.878 GHz - 7,452,050,539 instructions # 2.16 insn per cycle - 1.200346156 seconds time elapsed +TOTAL : 1.168797 sec + 3,453,510,139 cycles # 2.941 GHz + 7,453,168,499 instructions # 2.16 insn per cycle + 1.174935345 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.221197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.012402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.012402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.465484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.304262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.304262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.583142 sec - 3,273,382,507 cycles # 2.061 GHz - 6,100,795,667 instructions # 1.86 insn per cycle - 1.589319377 seconds time elapsed +TOTAL : 1.534111 sec + 3,274,248,388 cycles # 2.127 GHz + 6,100,577,921 instructions # 1.86 insn per cycle + 1.540213764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index dcdda81950..375a817a79 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:05:09 +DATE: 2023-11-09_18:23:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.826188e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.648877e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951378e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.824516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637814e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.946525e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.557947 sec - 2,332,705,336 cycles # 3.000 GHz - 3,420,801,676 instructions # 1.47 insn per cycle - 0.836912289 seconds time elapsed +TOTAL : 0.570368 sec + 2,261,358,530 cycles # 2.855 GHz + 3,305,358,456 instructions # 1.46 insn per cycle + 0.849017060 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.339471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.416548e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.416548e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.401234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401234e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.618720 sec - 14,313,897,069 cycles # 3.097 GHz - 38,421,663,028 instructions # 2.68 insn per cycle - 4.623775275 seconds time elapsed +TOTAL : 4.645582 sec + 14,325,809,375 cycles # 3.082 GHz + 38,422,987,894 instructions # 2.68 insn per cycle + 4.650648560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.232630e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.661001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.661001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.201320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.630090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.630090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.140530 sec - 6,636,885,571 cycles # 3.094 GHz - 15,842,171,589 instructions # 2.39 insn per cycle - 2.145594820 seconds time elapsed +TOTAL : 2.152860 sec + 6,643,060,083 cycles # 3.080 GHz + 15,842,584,477 instructions # 2.38 insn per cycle + 2.158023571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.545031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097804e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097804e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.450401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089441e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.233588 sec - 3,635,079,459 cycles # 2.936 GHz - 7,590,685,166 instructions # 2.09 insn per cycle - 1.238746125 seconds time elapsed +TOTAL : 1.246774 sec + 3,643,683,352 cycles # 2.913 GHz + 7,592,040,005 instructions # 2.08 insn per cycle + 1.251723719 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024875e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195413e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180349e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.160670 sec - 3,429,453,475 cycles # 2.944 GHz - 7,166,679,947 instructions # 2.09 insn per cycle - 1.165684786 seconds time elapsed +TOTAL : 1.172023 sec + 3,431,252,645 cycles # 2.917 GHz + 7,165,511,136 instructions # 2.09 insn per cycle + 1.177142051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.262300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.049639e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.049639e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.431571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.259454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259454e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.582365 sec - 3,235,924,413 cycles # 2.039 GHz - 5,796,611,749 instructions # 1.79 insn per cycle - 1.587507042 seconds time elapsed +TOTAL : 1.550630 sec + 3,238,644,111 cycles # 2.083 GHz + 5,796,702,494 instructions # 1.79 insn per cycle + 1.555869344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 831fd0fa9f..573aa8a1a6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_22:01:51 +DATE: 2023-11-09_18:20:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.837632e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.654775e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958238e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.875401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666103e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969743e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.503341 sec - 2,173,332,424 cycles # 3.019 GHz - 3,385,289,251 instructions # 1.56 insn per cycle - 0.779359289 seconds time elapsed +TOTAL : 0.505509 sec + 2,155,710,329 cycles # 2.977 GHz + 3,399,528,814 instructions # 1.58 insn per cycle + 0.781726612 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.329232e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.405368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.405368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.331654e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586570 sec - 14,159,897,717 cycles # 3.085 GHz - 38,395,355,740 instructions # 2.71 insn per cycle - 4.591702989 seconds time elapsed +TOTAL : 4.581584 sec + 14,155,354,915 cycles # 3.087 GHz + 38,394,211,404 instructions # 2.71 insn per cycle + 4.586893992 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.170239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.592491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.592491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.232934e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.666455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.666455e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.112173 sec - 6,472,075,786 cycles # 3.058 GHz - 15,829,638,315 instructions # 2.45 insn per cycle - 2.117221818 seconds time elapsed +TOTAL : 2.087317 sec + 6,475,857,503 cycles # 3.096 GHz + 15,829,568,301 instructions # 2.44 insn per cycle + 2.092497637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.605537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104706e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104706e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.589829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103388e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.174316 sec - 3,462,364,333 cycles # 2.937 GHz - 7,606,467,395 instructions # 2.20 insn per cycle - 1.179522425 seconds time elapsed +TOTAL : 1.175635 sec + 3,460,928,709 cycles # 2.933 GHz + 7,606,660,397 instructions # 2.20 insn per cycle + 1.180756657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024286e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.190805e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190805e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.939659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155071e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155071e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.105549 sec - 3,254,375,411 cycles # 2.932 GHz - 7,215,571,393 instructions # 2.22 insn per cycle - 1.110519445 seconds time elapsed +TOTAL : 1.139254 sec + 3,252,781,739 cycles # 2.845 GHz + 7,214,861,555 instructions # 2.22 insn per cycle + 1.144377149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.518662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.361331e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.361331e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.585994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.448568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.448568e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.478873 sec - 3,068,230,484 cycles # 2.069 GHz - 5,846,211,473 instructions # 1.91 insn per cycle - 1.484040601 seconds time elapsed +TOTAL : 1.467081 sec + 3,063,258,508 cycles # 2.082 GHz + 5,845,738,451 instructions # 1.91 insn per cycle + 1.472345808 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index bb838a2196..415792c712 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:58:39 +DATE: 2023-11-09_18:16:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.130902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643491e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.939128e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.158996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650796e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951969e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.604908 sec - 2,484,417,262 cycles # 3.021 GHz - 3,852,149,899 instructions # 1.55 insn per cycle - 0.881326202 seconds time elapsed +TOTAL : 0.604748 sec + 2,477,150,875 cycles # 3.008 GHz + 3,827,452,997 instructions # 1.55 insn per cycle + 0.882802717 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404078e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404078e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278227e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586292 sec - 14,210,336,618 cycles # 3.096 GHz - 38,392,847,533 instructions # 2.70 insn per cycle - 4.591549142 seconds time elapsed +TOTAL : 4.688006 sec + 14,149,964,703 cycles # 3.016 GHz + 38,393,052,805 instructions # 2.71 insn per cycle + 4.693060305 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.239674e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.668279e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.195001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.620625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.084661 sec - 6,470,762,281 cycles # 3.098 GHz - 15,829,570,536 instructions # 2.45 insn per cycle - 2.089664033 seconds time elapsed +TOTAL : 2.102860 sec + 6,473,914,859 cycles # 3.072 GHz + 15,829,595,595 instructions # 2.45 insn per cycle + 2.107994821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2689) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.589227e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.103396e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.103396e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.498213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092419e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092419e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.175545 sec - 3,466,544,418 cycles # 2.938 GHz - 7,606,584,140 instructions # 2.19 insn per cycle - 1.180575347 seconds time elapsed +TOTAL : 1.186261 sec + 3,464,671,010 cycles # 2.910 GHz + 7,606,636,115 instructions # 2.20 insn per cycle + 1.191422669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3051) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193480e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193480e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.018341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184021e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.105660 sec - 3,258,740,690 cycles # 2.936 GHz - 7,215,101,525 instructions # 2.21 insn per cycle - 1.110765672 seconds time elapsed +TOTAL : 1.113056 sec + 3,253,634,801 cycles # 2.912 GHz + 7,214,825,947 instructions # 2.22 insn per cycle + 1.118242022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2850) (512y: 23) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.584208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.436586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.436586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.371211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.371211e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.465958 sec - 3,064,168,908 cycles # 2.084 GHz - 5,845,466,179 instructions # 1.91 insn per cycle - 1.471139277 seconds time elapsed +TOTAL : 1.477022 sec + 3,066,754,541 cycles # 2.070 GHz + 5,845,673,759 instructions # 1.91 insn per cycle + 1.482222084 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2364) (512y: 24) (512z: 1889) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index d667b6dbf4..dbd0c88759 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-08_21:19:44 +DATE: 2023-11-09_17:41:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.049999e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.742417e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.025106e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.062894e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.751636e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032491e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.475958 sec - 2,061,164,716 cycles # 2.907 GHz - 2,917,299,650 instructions # 1.42 insn per cycle - 0.766837667 seconds time elapsed +TOTAL : 0.471370 sec + 2,069,916,039 cycles # 2.986 GHz + 2,893,797,319 instructions # 1.40 insn per cycle + 0.749930288 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.217835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.287538e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.287538e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.241816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.314075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314075e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.813699 sec - 14,428,562,676 cycles # 2.998 GHz - 39,888,508,384 instructions # 2.76 insn per cycle - 4.818824247 seconds time elapsed +TOTAL : 4.763197 sec + 14,419,363,408 cycles # 3.025 GHz + 39,885,822,805 instructions # 2.77 insn per cycle + 4.768145939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.957468e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.536679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.536679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.077159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.666017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.666017e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.845039 sec - 5,590,599,138 cycles # 3.023 GHz - 15,299,534,426 instructions # 2.74 insn per cycle - 1.850198462 seconds time elapsed +TOTAL : 1.809175 sec + 5,591,744,554 cycles # 3.083 GHz + 15,300,029,522 instructions # 2.74 insn per cycle + 1.814409785 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.651061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.332537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.332537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.801496e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504366e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.660892 sec - 4,740,556,619 cycles # 2.846 GHz - 9,747,822,441 instructions # 2.06 insn per cycle - 1.666191221 seconds time elapsed +TOTAL : 1.624464 sec + 4,741,141,330 cycles # 2.911 GHz + 9,747,661,132 instructions # 2.06 insn per cycle + 1.629561959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3710) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.778515e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.494686e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.494686e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.005329e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.745480e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.745480e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.631450 sec
- 4,628,439,590 cycles # 2.829 GHz
- 9,339,816,116 instructions # 2.02 insn per cycle
- 1.636603727 seconds time elapsed
+TOTAL : 1.578447 sec
+ 4,623,271,493 cycles # 2.921 GHz
+ 9,339,033,786 instructions # 2.02 insn per cycle
+ 1.583594825 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.981004e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.517698e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.517698e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.210537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.774289e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.774289e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.837853 sec
- 3,663,588,168 cycles # 1.989 GHz
- 7,045,799,249 instructions # 1.92 insn per cycle
- 1.843187351 seconds time elapsed
+TOTAL : 1.770504 sec
+ 3,648,791,259 cycles # 2.056 GHz
+ 7,045,498,641 instructions # 1.93 insn per cycle
+ 1.775670307 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2606) (512y: 12) (512z: 2221)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index e94beeddac..c0790b6e36 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:42:28
+DATE: 2023-11-09_18:00:53
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.362873e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.640443e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.957691e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.858794e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.673199e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981057e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.478743 sec
- 2,066,773,251 cycles # 2.940 GHz
- 2,882,191,672 instructions # 1.39 insn per cycle
- 0.760603829 seconds time elapsed
+TOTAL : 0.474263 sec
+ 2,125,063,536 cycles # 3.002 GHz
+ 3,025,852,918 instructions # 1.42 insn per cycle
+ 0.764897313 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.571240e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.665961e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.665961e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.589894e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.682971e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.682971e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.163303 sec
- 12,605,463,394 cycles # 3.025 GHz
- 34,393,608,512 instructions # 2.73 insn per cycle
- 4.168641817 seconds time elapsed
+TOTAL : 4.133418 sec
+ 12,609,458,975 cycles # 3.048 GHz
+ 34,395,001,210 instructions # 2.73 insn per cycle
+ 4.138439483 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.401759e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.886488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.886488e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.435122e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.914251e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.914251e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.027469 sec
- 6,100,742,722 cycles # 3.002 GHz
- 14,874,619,740 instructions # 2.44 insn per cycle
- 2.032997684 seconds time elapsed
+TOTAL : 2.014773 sec
+ 6,085,710,075 cycles # 3.014 GHz
+ 14,874,327,590 instructions # 2.44 insn per cycle
+ 2.020198945 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.152588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.984648e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.984648e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.550169e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.423492e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.423492e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.570348 sec
- 4,280,521,919 cycles # 2.743 GHz
- 9,042,316,644 instructions # 2.11 insn per cycle
- 1.575934676 seconds time elapsed
+TOTAL : 1.471458 sec
+ 4,290,277,982 cycles # 2.907 GHz
+ 9,041,954,393 instructions # 2.11 insn per cycle
+ 1.476543510 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.548985e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.445828e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.445828e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.705610e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.621776e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.621776e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.472831 sec
- 4,206,089,473 cycles # 2.847 GHz
- 8,677,889,358 instructions # 2.06 insn per cycle
- 1.478375348 seconds time elapsed
+TOTAL : 1.443048 sec
+ 4,208,694,980 cycles # 2.909 GHz
+ 8,677,287,895 instructions # 2.06 insn per cycle
+ 1.448442097 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.660562e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.137441e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.137441e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.842247e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.341676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.341676e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.938115 sec
- 3,846,715,012 cycles # 1.980 GHz
- 7,820,097,651 instructions # 2.03 insn per cycle
- 1.943482590 seconds time elapsed
+TOTAL : 1.878702 sec
+ 3,847,091,668 cycles # 2.044 GHz
+ 7,820,914,226 instructions # 2.03 insn per cycle
+ 1.883936977 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index a8a81cca05..a8fdecb532 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:42:52
+DATE: 2023-11-09_18:01:17
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.468219e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.688670e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018561e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.862525e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.715295e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.030318e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.479145 sec
- 2,060,928,745 cycles # 2.937 GHz
- 2,943,965,642 instructions # 1.43 insn per cycle
- 0.760902085 seconds time elapsed
+TOTAL : 0.474547 sec
+ 2,129,920,938 cycles # 3.015 GHz
+ 3,022,843,622 instructions # 1.42 insn per cycle
+ 0.763729431 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.752408e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.860428e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.860428e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.720126e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.824009e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.824009e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 3.895863 sec
- 11,764,358,308 cycles # 3.017 GHz
- 35,130,105,613 instructions # 2.99 insn per cycle
- 3.901121829 seconds time elapsed
+TOTAL : 3.938814 sec
+ 11,787,930,920 cycles # 2.995 GHz
+ 35,134,515,128 instructions # 2.98 insn per cycle
+ 3.943783291 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.491671e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.980976e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.980976e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.688740e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.207831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.207831e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 1.995272 sec
- 5,963,721,442 cycles # 2.982 GHz
- 14,483,479,258 instructions # 2.43 insn per cycle
- 2.000909308 seconds time elapsed
+TOTAL : 1.927645 sec
+ 5,955,477,747 cycles # 3.083 GHz
+ 14,483,875,890 instructions # 2.43 insn per cycle
+ 1.932605425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.606859e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.529662e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.529662e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.792092e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.717382e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.717382e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.463863 sec
- 4,171,268,875 cycles # 2.840 GHz
- 8,887,248,415 instructions # 2.13 insn per cycle
- 1.469508622 seconds time elapsed
+TOTAL : 1.428222 sec
+ 4,172,426,658 cycles # 2.912 GHz
+ 8,888,638,577 instructions # 2.13 insn per cycle
+ 1.433579963 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3576) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.334017e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.185528e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.185528e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.830183e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.777369e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.777369e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.515911 sec
- 4,141,896,373 cycles # 2.724 GHz
- 8,425,434,947 instructions # 2.03 insn per cycle
- 1.521361653 seconds time elapsed
+TOTAL : 1.421326 sec
+ 4,143,555,691 cycles # 2.906 GHz
+ 8,424,122,393 instructions # 2.03 insn per cycle
+ 1.426420575 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.735035e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.250427e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.250427e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.911357e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.422090e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.422090e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.913707 sec
- 3,815,274,575 cycles # 1.989 GHz
- 7,713,047,642 instructions # 2.02 insn per cycle
- 1.919181973 seconds time elapsed
+TOTAL : 1.856974 sec
+ 3,783,077,119 cycles # 2.033 GHz
+ 7,713,045,733 instructions # 2.04 insn per cycle
+ 1.862087187 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 1d637e1269..bc7d9de588 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:20:08
+DATE: 2023-11-09_17:41:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.064819e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.168761e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.265943e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.109904e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.171630e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.269382e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.516845 sec
- 2,194,660,841 cycles # 2.941 GHz
- 3,161,612,621 instructions # 1.44 insn per cycle
- 0.804942538 seconds time elapsed
+TOTAL : 0.511961 sec
+ 2,222,105,943 cycles # 3.001 GHz
+ 3,180,029,506 instructions # 1.43 insn per cycle
+ 0.797868325 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.076007e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.135159e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.135159e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.142732e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.204413e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.204413e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.157074 sec
- 15,456,785,340 cycles # 2.995 GHz
- 38,638,875,955 instructions # 2.50 insn per cycle
- 5.162652658 seconds time elapsed
+TOTAL : 4.999043 sec
+ 15,266,738,883 cycles # 3.052 GHz
+ 38,639,692,678 instructions # 2.53 insn per cycle
+ 5.004417103 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.689929e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.902707e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.902707e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.675686e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.874485e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.874485e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.947066 sec
- 8,960,192,906 cycles # 3.035 GHz
- 24,239,204,206 instructions # 2.71 insn per cycle
- 2.952599117 seconds time elapsed
+TOTAL : 2.956696 sec
+ 8,943,278,567 cycles # 3.020 GHz
+ 24,239,461,473 instructions # 2.71 insn per cycle
+ 2.961985342 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.870612e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.391820e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.391820e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.810568e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.309343e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.309343e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.891319 sec
- 5,424,929,342 cycles # 2.862 GHz
- 11,287,630,140 instructions # 2.08 insn per cycle
- 1.896741262 seconds time elapsed
+TOTAL : 1.907942 sec
+ 5,390,382,442 cycles # 2.818 GHz
+ 11,287,870,279 instructions # 2.09 insn per cycle
+ 1.913175131 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.626799e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.289896e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.289896e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.736389e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.412733e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.412733e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.686295 sec
- 4,842,859,663 cycles # 2.864 GHz
- 10,535,885,470 instructions # 2.18 insn per cycle
- 1.691658185 seconds time elapsed
+TOTAL : 1.660808 sec
+ 4,859,407,660 cycles # 2.918 GHz
+ 10,535,709,652 instructions # 2.17 insn per cycle
+ 1.666185530 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.120532e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.365927e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.365927e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.170238e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.418556e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.418556e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.650947 sec
- 5,210,620,634 cycles # 1.962 GHz
- 7,614,639,902 instructions # 1.46 insn per cycle
- 2.656437650 seconds time elapsed
+TOTAL : 2.618902 sec
+ 5,253,729,468 cycles # 2.003 GHz
+ 7,613,729,309 instructions # 1.45 insn per cycle
+ 2.624316082 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 92e3c9f0b5..008a5e172d 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2023-11-08_21:20:35
+DATE: 2023-11-09_17:41:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.066522e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.173508e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.273022e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.128890e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.181968e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279178e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.512769 sec
- 2,197,876,209 cycles # 2.961 GHz
- 3,170,940,757 instructions # 1.44 insn per cycle
- 0.799563998 seconds time elapsed
+TOTAL : 0.513214 sec
+ 2,219,973,022 cycles # 2.991 GHz
+ 3,202,428,118 instructions # 1.44 insn per cycle
+ 0.799522630 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
@@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.111886e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.172848e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.172848e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.124085e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.184530e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.184530e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.069953 sec
- 15,385,884,321 cycles # 3.032 GHz
- 40,433,272,287 instructions # 2.63 insn per cycle
- 5.075349465 seconds time elapsed
+TOTAL : 5.039921 sec
+ 15,384,037,518 cycles # 3.050 GHz
+ 40,433,132,851 instructions # 2.63 insn per cycle
+ 5.045085372 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.654822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.859127e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.859127e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.855191e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.079392e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.079392e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.975229 sec
- 8,506,893,399 cycles # 2.855 GHz
- 23,270,886,855 instructions # 2.74 insn per cycle
- 2.980696937 seconds time elapsed
+TOTAL : 2.823965 sec
+ 8,503,215,845 cycles # 3.006 GHz
+ 23,269,764,862 instructions # 2.74 insn per cycle
+ 2.829223148 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.053911e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.431363e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.431363e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.125017e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.510855e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.510855e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.179721 sec
- 6,241,572,834 cycles # 2.857 GHz
- 12,973,482,438 instructions # 2.08 insn per cycle
- 2.185137091 seconds time elapsed
+TOTAL : 2.149257 sec
+ 6,265,408,652 cycles # 2.910 GHz
+ 12,973,997,697 instructions # 2.07 insn per cycle
+ 2.154583439 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.331614e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.744905e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.744905e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.427179e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.860121e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.860121e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.072194 sec
- 5,929,542,555 cycles # 2.855 GHz
- 12,251,825,862 instructions # 2.07 insn per cycle
- 2.077717224 seconds time elapsed
+TOTAL : 2.035544 sec
+ 5,944,578,726 cycles # 2.915 GHz
+ 12,250,352,313 instructions # 2.06 insn per cycle
+ 2.040880399 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.800727e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.013912e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.013912e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.896609e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.113493e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.113493e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.863923 sec
- 5,611,513,288 cycles # 1.956 GHz
- 8,753,901,381 instructions # 1.56 insn per cycle
- 2.869313331 seconds time elapsed
+TOTAL : 2.794321 sec
+ 5,604,210,205 cycles # 2.003 GHz
+ 8,753,670,387 instructions # 1.56 insn per cycle
+ 2.799501421 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 87df63c965..a6a310dca7 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:21:03
+DATE: 2023-11-09_17:42:23
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.879738e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.041736e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.055795e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.987135e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.050792e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.063302e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.461849 sec
- 1,973,375,466 cycles # 2.915 GHz
- 2,850,187,396 instructions # 1.44 insn per cycle
- 0.733799311 seconds time elapsed
+TOTAL : 0.461215 sec
+ 2,013,982,440 cycles # 2.996 GHz
+ 2,888,271,641 instructions # 1.43 insn per cycle
+ 0.731639311 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.114902e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.320626e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.332328e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.121271e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.323663e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.335167e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.597626 sec
- 2,460,714,562 cycles # 2.956 GHz
- 3,716,258,767 instructions # 1.51 insn per cycle
- 0.892242937 seconds time elapsed
+TOTAL : 0.596567 sec
+ 2,489,603,363 cycles # 2.997 GHz
+ 3,769,346,991 instructions # 1.51 insn per cycle
+ 0.890614911 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.537254e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.549613e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.549613e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.576698e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.589005e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.589005e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.480284 sec
- 19,731,245,814 cycles # 3.044 GHz
- 59,610,628,892 instructions # 3.02 insn per cycle
- 6.484553626 seconds time elapsed
+TOTAL : 6.380855 sec
+ 19,728,048,826 cycles # 3.090 GHz
+ 59,610,032,345 instructions # 3.02 insn per cycle
+ 6.384875624 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.819525e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.864015e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.864015e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.837473e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.882254e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.882254e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.421528 sec
- 10,361,656,121 cycles # 3.025 GHz
- 30,678,833,436 instructions # 2.96 insn per cycle
- 3.425797412 seconds time elapsed
+TOTAL : 3.409518 sec
+ 10,359,121,121 cycles # 3.036 GHz
+ 30,679,203,213 instructions # 2.96 insn per cycle
+ 3.413745701 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.328413e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.498915e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.498915e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.786469e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.964416e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.964416e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.779184 sec
- 4,885,070,909 cycles # 2.740 GHz
- 11,021,940,228 instructions # 2.26 insn per cycle
- 1.783393950 seconds time elapsed
+TOTAL : 1.696222 sec
+ 4,887,496,480 cycles # 2.875 GHz
+ 11,021,602,656 instructions # 2.26 insn per cycle
+ 1.700511665 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.089421e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.111598e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111598e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.093744e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.115987e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.115987e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.526514 sec
- 4,365,565,996 cycles # 2.854 GHz
- 10,298,805,774 instructions # 2.36 insn per cycle
- 1.530732946 seconds time elapsed
+TOTAL : 1.520406 sec
+ 4,369,323,760 cycles # 2.867 GHz
+ 10,298,269,078 instructions # 2.36 insn per cycle
+ 1.524718704 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.324075e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.430754e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.430754e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.753883e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.865687e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.865687e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.262206 sec
- 4,104,673,936 cycles # 1.812 GHz
- 5,846,278,322 instructions # 1.42 insn per cycle
- 2.266456846 seconds time elapsed
+TOTAL : 2.137350 sec
+ 4,099,012,031 cycles # 1.915 GHz
+ 5,845,815,520 instructions # 1.43 insn per cycle
+ 2.141590310 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
index a8aafca020..47e341807c 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:52:42
+DATE: 2023-11-09_18:10:58
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.668584e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.838174e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.838174e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.707712e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.862456e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.862456e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.491390 sec
- 2,056,116,630 cycles # 2.930 GHz
- 3,087,605,373 instructions # 1.50 insn per cycle
- 0.760599439 seconds time elapsed
+TOTAL : 0.491938 sec
+ 2,095,418,329 cycles # 2.943 GHz
+ 3,181,792,573 instructions # 1.52 insn per cycle
+ 0.771165711 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.753470e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.636054e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.636054e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.763222e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.617411e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.617411e+06 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.817784 sec
- 3,130,594,447 cycles # 2.944 GHz
- 4,997,770,241 instructions # 1.60 insn per cycle
- 1.126915791 seconds time elapsed
+TOTAL : 0.818666 sec
+ 3,177,291,822 cycles # 2.975 GHz
+ 5,098,451,441 instructions # 1.60 insn per cycle
+ 1.129356217 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.533314e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.546135e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.546135e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.524248e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.536588e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.536588e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.496533 sec
- 19,730,935,453 cycles # 3.036 GHz
- 59,615,663,798 instructions # 3.02 insn per cycle
- 6.500895427 seconds time elapsed
+TOTAL : 6.519126 sec
+ 19,771,628,211 cycles # 3.032 GHz
+ 59,619,366,283 instructions # 3.02 insn per cycle
+ 6.523440391 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.824473e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.869855e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.869855e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.881918e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.927973e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.927973e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.425054 sec
- 10,403,336,159 cycles # 3.035 GHz
- 30,728,089,368 instructions # 2.95 insn per cycle
- 3.429466512 seconds time elapsed
+TOTAL : 3.385706 sec
+ 10,402,667,023 cycles # 3.069 GHz
+ 30,728,506,666 instructions # 2.95 insn per cycle
+ 3.390173573 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.541398e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.724381e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.724381e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.797699e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.978652e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.978652e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.747981 sec
- 4,923,635,172 cycles # 2.811 GHz
- 11,072,838,099 instructions # 2.25 insn per cycle
- 1.752609449 seconds time elapsed
+TOTAL : 1.701840 sec
+ 4,920,530,137 cycles # 2.885 GHz
+ 11,072,335,054 instructions # 2.25 insn per cycle
+ 1.706256708 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.072827e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.095239e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.095239e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.099458e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.122078e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.122078e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.557290 sec
- 4,408,906,008 cycles # 2.824 GHz
- 10,349,337,234 instructions # 2.35 insn per cycle
- 1.561766662 seconds time elapsed
+TOTAL : 1.518361 sec
+ 4,398,354,549 cycles # 2.890 GHz
+ 10,347,368,561 instructions # 2.35 insn per cycle
+ 1.522642923 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.462789e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.573036e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.573036e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.773044e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.885749e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.885749e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.226828 sec
- 4,140,433,235 cycles # 1.856 GHz
- 5,883,947,133 instructions # 1.42 insn per cycle
- 2.231231918 seconds time elapsed
+TOTAL : 2.139176 sec
+ 4,134,059,026 cycles # 1.929 GHz
+ 5,885,050,529 instructions # 1.42 insn per cycle
+ 2.143583199 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
index 2485d7fbb8..de9a4f17b0 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2023-11-08_21:21:32
+DATE: 2023-11-09_17:42:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.914793e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.044227e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.057322e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.944811e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.043287e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.055886e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.462395 sec
- 2,001,608,406 cycles # 2.941 GHz
- 2,866,642,977 instructions # 1.43 insn per cycle
- 0.738112039 seconds time elapsed
+TOTAL : 0.460458 sec
+ 2,026,281,331 cycles # 3.005 GHz
+ 2,900,924,761 instructions # 1.43 insn per cycle
+ 0.731473724 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.109030e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.310930e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.322842e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115492e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.315818e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.327216e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.592309 sec
- 2,454,004,684 cycles # 2.967 GHz
- 3,701,468,710 instructions # 1.51 insn per cycle
- 0.885901852 seconds time elapsed
+TOTAL : 0.589562 sec
+ 2,467,189,653 cycles # 3.006 GHz
+ 3,742,728,616 instructions # 1.52 insn per cycle
+ 0.882301885 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.546247e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.558939e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.558939e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.562701e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.575539e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.575539e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.457597 sec
- 19,573,619,879 cycles # 3.030 GHz
- 58,802,481,580 instructions # 3.00 insn per cycle
- 6.461777687 seconds time elapsed
+TOTAL : 6.415806 sec
+ 19,556,589,093 cycles # 3.047 GHz
+ 58,802,097,142 instructions # 3.01 insn per cycle
+ 6.419943255 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.793642e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.840400e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.840400e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.964793e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.010479e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.010479e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.440445 sec
- 10,252,301,234 cycles # 2.977 GHz
- 30,351,085,669 instructions # 2.96 insn per cycle
- 3.444877379 seconds time elapsed
+TOTAL : 3.321576 sec
+ 10,234,879,480 cycles # 3.078 GHz
+ 30,349,718,565 instructions # 2.97 insn per cycle
+ 3.325925546 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.384802e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.551869e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.551869e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.508412e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.675254e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.675254e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.768254 sec
- 5,044,938,195 cycles # 2.848 GHz
- 11,486,596,301 instructions # 2.28 insn per cycle
- 1.772428896 seconds time elapsed
+TOTAL : 1.744975 sec
+ 5,046,123,954 cycles # 2.887 GHz
+ 11,486,788,981 instructions # 2.28 insn per cycle
+ 1.749151834 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.019018e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.038703e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.038703e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.033659e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.053692e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053692e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.630183 sec
- 4,647,706,592 cycles # 2.845 GHz
- 10,845,108,593 instructions # 2.33 insn per cycle
- 1.634411362 seconds time elapsed
+TOTAL : 1.606653 sec
+ 4,645,095,124 cycles # 2.885 GHz
+ 10,843,590,320 instructions # 2.33 insn per cycle
+ 1.610949978 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.188773e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.290125e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.290125e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.741864e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.853507e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.853507e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.304290 sec
- 4,123,403,300 cycles # 1.794 GHz
- 6,113,558,333 instructions # 1.48 insn per cycle
- 2.308644720 seconds time elapsed
+TOTAL : 2.140995 sec
+ 4,112,867,345 cycles # 1.919 GHz
+ 6,110,383,002 instructions # 1.49 insn per cycle
+ 2.145162136 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568)
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 0b448796b2..f7b3cf47d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:02 +DATE: 2023-11-09_17:43:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.567286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376211e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.468457e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.559244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332615e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.416599e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.444374 sec - 1,959,403,583 cycles # 2.932 GHz - 2,755,627,615 instructions # 1.41 insn per cycle - 0.725331091 seconds time elapsed +TOTAL : 0.442703 sec + 1,956,548,432 cycles # 2.973 GHz + 2,743,818,395 instructions # 1.40 insn per cycle + 0.717328196 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.353667e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.408300e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.476909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.415878e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.488188e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.558288e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.490778 sec - 2,119,348,519 cycles # 2.946 GHz - 3,045,536,225 instructions # 1.44 insn per cycle - 0.776414109 seconds time elapsed +TOTAL : 0.487204 sec + 2,131,239,677 cycles # 3.000 GHz + 3,082,245,234 instructions # 1.45 insn per cycle + 0.768130616 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,825,625 cycles # 2.791 GHz - 48,514,379 instructions # 1.52 insn per cycle - 0.011782396 seconds time elapsed + 31,971,805 cycles # 2.811 GHz + 48,583,386 instructions # 1.52 insn per cycle + 0.011876482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 2f35cf010a..e1663755b4 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:53:11 +DATE: 2023-11-09_18:11:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.915722e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.200179e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.200179e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.114759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.213627e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213627e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.459965 sec - 1,913,489,356 cycles # 2.854 GHz - 2,835,494,218 instructions # 1.48 insn per cycle - 0.728586503 seconds time elapsed +TOTAL : 0.453382 sec + 1,979,110,250 cycles # 2.985 GHz + 2,941,718,851 instructions # 1.49 insn per cycle + 0.719982475 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.767536e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.641642e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.641642e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.789515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657512e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657512e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.634368 sec - 2,553,649,677 cycles # 2.951 GHz - 3,942,242,941 instructions # 1.54 insn per cycle - 0.922459199 seconds time elapsed +TOTAL : 0.632326 sec + 2,585,787,492 cycles # 3.000 GHz + 3,972,159,776 instructions # 1.54 insn per cycle + 0.920056111 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,9 +99,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 38,286,300 cycles # 2.778 GHz - 51,959,635 instructions # 1.36 insn per cycle - 0.014194921 seconds time elapsed + 38,570,643 cycles # 2.885 GHz + 52,119,941 instructions # 1.35 insn per cycle + 0.013856202 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index e630fbc27d..e8b37410be 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:11 +DATE: 2023-11-09_17:43:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.560442e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377270e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.470091e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.567326e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333824e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424930e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443339 sec - 1,943,957,931 cycles # 2.944 GHz - 2,765,105,739 instructions # 1.42 insn per cycle - 0.717258208 seconds time elapsed +TOTAL : 0.444883 sec + 1,998,454,742 cycles # 2.980 GHz + 2,813,430,207 instructions # 1.41 insn per cycle + 0.728667460 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.360432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412708e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.481720e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379215e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.422915e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.490315e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.491895 sec - 2,104,648,838 cycles # 2.938 GHz - 3,025,148,863 instructions # 1.44 insn per cycle - 0.773979442 seconds time elapsed +TOTAL : 0.488237 sec + 2,124,585,750 cycles # 2.987 GHz + 3,077,258,575 instructions # 1.45 insn per cycle + 0.769041859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 31,662,761 cycles # 2.798 GHz - 47,511,797 instructions # 1.50 insn per cycle - 0.011712916 seconds time elapsed + 31,375,066 cycles # 2.814 GHz + 47,697,134 instructions # 1.52 insn per cycle + 0.011523392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index e83376e827..aa3d979423 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:21 +DATE: 2023-11-09_17:43:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.888685e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043488e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056349e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.974532e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062592e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.461575 sec - 1,992,206,499 cycles # 2.947 GHz - 2,868,298,614 instructions # 1.44 insn per cycle - 0.733257197 seconds time elapsed +TOTAL : 0.466235 sec + 1,982,451,794 cycles # 2.881 GHz + 2,904,128,689 instructions # 1.46 insn per cycle + 0.746029193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111138e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315581e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.327177e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.118841e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320828e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332362e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.598628 sec - 2,465,744,279 cycles # 2.958 GHz - 3,812,193,472 instructions # 1.55 insn per cycle - 0.893336251 seconds time elapsed +TOTAL : 0.602851 sec + 2,418,002,144 cycles # 2.873 GHz + 3,684,858,061 instructions # 1.52 insn per cycle + 0.899181828 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,711,490 cycles # 2.787 GHz - 50,039,456 instructions # 1.44 insn per cycle - 0.012986618 seconds time elapsed + 34,749,440 cycles # 2.771 GHz + 50,090,467 instructions # 1.44 insn per cycle + 0.013126058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index ab62773e76..fa1b7c54dc 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-08_21:22:30 +DATE: 2023-11-09_17:43:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.840662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.037949e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050999e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.943854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040668e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053565e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462948 sec - 1,939,550,045 cycles # 2.866 GHz - 2,822,181,727 instructions # 1.46 insn per cycle - 0.733825753 seconds time elapsed +TOTAL : 0.466224 sec + 1,967,855,817 cycles # 2.845 GHz + 2,813,069,845 instructions # 1.43 insn per cycle + 0.750240924 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.102587e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303113e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305249e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316509e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.591515 sec - 2,444,078,815 cycles # 2.952 GHz - 3,674,116,474 instructions # 1.50 insn per cycle - 0.887442466 seconds time elapsed +TOTAL : 0.594738 sec + 2,473,036,884 cycles # 2.994 GHz + 3,768,499,475 instructions # 1.52 insn per cycle + 0.886746599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,9 +86,9 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 34,181,769 cycles # 2.772 GHz - 49,201,973 instructions # 1.44 insn per cycle - 0.012846211 seconds time elapsed + 34,257,253 cycles # 2.793 GHz + 49,140,913 instructions # 1.43 insn per cycle + 0.012667194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 0e571e2957..5de2ca45d8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:22:40 +DATE: 2023-11-09_17:43:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.509565e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538049e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.522792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525024e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522429 sec - 2,216,464,510 cycles # 2.948 GHz - 3,445,335,287 instructions # 1.55 insn per cycle - 0.813178007 seconds time elapsed +TOTAL : 0.521868 sec + 2,246,075,293 cycles # 2.975 GHz + 3,415,991,617 instructions # 1.52 insn per cycle + 0.815510814 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.124490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.152981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.122388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.150135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151328e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.028693 sec - 9,700,865,704 cycles # 2.960 GHz - 20,299,179,534 instructions # 2.09 insn per cycle - 3.337900982 seconds time elapsed +TOTAL : 3.026853 sec + 9,913,864,058 cycles # 3.024 GHz + 22,195,735,281 instructions # 2.24 insn per cycle + 3.335346642 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948157e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949119e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949119e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927075e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927983e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927983e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.428390 sec - 25,658,286,461 cycles # 3.043 GHz - 78,943,496,553 instructions # 3.08 insn per cycle - 8.432674701 seconds time elapsed +TOTAL : 8.520375 sec + 25,675,362,415 cycles # 3.013 GHz + 78,943,710,554 instructions # 3.07 insn per cycle + 8.524455360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.638426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641828e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641828e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557363e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.560585e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560585e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.516511 sec - 12,940,511,466 cycles # 2.863 GHz - 39,286,083,355 instructions # 3.04 insn per cycle - 4.520821646 seconds time elapsed +TOTAL : 4.619361 sec + 12,935,854,234 cycles # 2.798 GHz + 39,286,025,399 instructions # 3.04 insn per cycle + 4.623706542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.063000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.079398e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.079398e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.091948e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.108522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.043453 sec - 5,578,804,578 cycles # 2.725 GHz - 13,689,979,347 instructions # 2.45 insn per cycle - 2.047766279 seconds time elapsed +TOTAL : 2.036281 sec + 5,584,766,890 cycles # 2.738 GHz + 13,690,141,249 instructions # 2.45 insn per cycle + 2.040702440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.584845e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.608001e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.608001e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.675809e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.698948e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.698948e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.720447 sec - 4,895,207,627 cycles # 2.839 GHz - 12,344,429,833 instructions # 2.52 insn per cycle - 1.724685286 seconds time elapsed +TOTAL : 1.704074 sec + 4,897,181,740 cycles # 2.868 GHz + 12,344,518,245 instructions # 2.52 insn per cycle + 1.708309061 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.405020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418567e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418567e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632146e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.645889e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.645889e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.224337 sec - 4,116,450,066 cycles # 1.848 GHz - 6,337,280,624 instructions # 1.54 insn per cycle - 2.228619766 seconds time elapsed +TOTAL : 2.158505 sec + 4,118,735,499 cycles # 1.905 GHz + 6,336,932,858 instructions # 1.54 insn per cycle + 2.162776211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 6cfffac867..322fb0150d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:53:56 +DATE: 2023-11-09_18:12:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.140206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.481973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481973e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.165662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477249e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.512248 sec - 2,184,996,199 cycles # 2.952 GHz - 3,435,282,796 instructions # 1.57 insn per cycle - 0.800472589 seconds time elapsed +TOTAL : 0.512387 sec + 2,201,868,575 cycles # 2.980 GHz + 3,430,381,187 instructions # 1.56 insn per cycle + 0.801238529 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.623195e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.099384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.642632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.111769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111769e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.306442 sec - 10,620,771,247 cycles # 2.970 GHz - 24,014,706,294 instructions # 2.26 insn per cycle - 3.633696672 seconds time elapsed +TOTAL : 3.299595 sec + 10,919,109,000 cycles # 3.053 GHz + 24,319,272,982 instructions # 2.23 insn per cycle + 3.633626468 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.935055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.935984e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935984e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957325e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958258e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.489050 sec - 25,665,712,522 cycles # 3.023 GHz - 78,953,227,075 instructions # 3.08 insn per cycle - 8.493532453 seconds time elapsed +TOTAL : 8.392728 sec + 25,662,881,797 cycles # 3.059 GHz + 78,952,840,684 instructions # 3.08 insn per cycle + 8.396994023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.600578e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604115e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604115e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730470e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733980e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733980e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.569107 sec - 12,945,693,806 cycles # 2.831 GHz - 39,298,314,532 instructions # 3.04 insn per cycle - 4.573645709 seconds time elapsed +TOTAL : 4.409754 sec + 12,949,002,647 cycles # 2.934 GHz + 39,297,510,156 instructions # 3.03 insn per cycle + 4.414215325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.385455e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.402719e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.402719e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.533999e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.551780e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.551780e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.969364 sec - 5,591,964,229 cycles # 2.834 GHz - 13,700,332,532 instructions # 2.45 insn per cycle - 1.973976640 seconds time elapsed +TOTAL : 1.934795 sec + 5,595,375,698 cycles # 2.886 GHz + 13,699,668,832 instructions # 2.45 insn per cycle + 1.939106700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.515181e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.538996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.538996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.706839e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.728905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.728905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.736968 sec - 4,912,884,670 cycles # 2.825 GHz - 12,356,069,233 instructions # 2.52 insn per cycle - 1.741510676 seconds time elapsed +TOTAL : 1.702912 sec + 4,912,481,885 cycles # 2.879 GHz + 12,355,076,796 instructions # 2.52 insn per cycle + 1.707414472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.401693e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.415615e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.415615e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.525002e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.540499e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.540499e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.229815 sec - 4,139,073,894 cycles # 1.853 GHz - 6,348,807,900 instructions # 1.53 insn per cycle - 2.234437952 seconds time elapsed +TOTAL : 2.193518 sec + 4,132,016,890 cycles # 1.881 GHz + 6,348,500,069 instructions # 1.54 insn per cycle + 2.198089448 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 829db14182..4e138ec032 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:05:32 +DATE: 2023-11-09_18:23:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.498087e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.526521e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.485315e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.511617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513675e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.505963 sec - 2,230,602,584 cycles # 2.998 GHz - 3,509,146,743 instructions # 1.57 insn per cycle - 0.814638005 seconds time elapsed +TOTAL : 0.505341 sec + 2,219,350,607 cycles # 2.986 GHz + 3,460,374,619 instructions # 1.56 insn per cycle + 0.811034575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.138629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.170285e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.171692e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178152e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.117149 sec - 10,263,257,910 cycles # 3.044 GHz - 22,984,843,224 instructions # 2.24 insn per cycle - 3.428387488 seconds time elapsed +TOTAL : 3.133190 sec + 10,226,911,008 cycles # 3.021 GHz + 21,462,701,558 instructions # 2.10 insn per cycle + 3.444111151 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.955282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.956235e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.956235e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.962376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963339e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963339e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.398336 sec - 25,654,945,707 cycles # 3.057 GHz - 78,946,836,924 instructions # 3.08 insn per cycle - 8.402318295 seconds time elapsed +TOTAL : 8.368138 sec + 25,660,792,563 cycles # 3.066 GHz + 78,945,591,899 instructions # 3.08 insn per cycle + 8.372166508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742322e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742322e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725556e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.729176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729176e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.397110 sec - 12,932,706,473 cycles # 2.939 GHz - 39,284,078,298 instructions # 3.04 insn per cycle - 4.401176578 seconds time elapsed +TOTAL : 4.413013 sec + 12,940,530,582 cycles # 2.932 GHz + 39,286,713,275 instructions # 3.04 insn per cycle + 4.417069788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.547122e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.565515e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.565515e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.541485e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.558659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.558659e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.929747 sec - 5,584,587,761 cycles # 2.889 GHz - 13,688,784,163 instructions # 2.45 insn per cycle - 1.933938249 seconds time elapsed +TOTAL : 1.930984 sec + 5,584,027,716 cycles # 2.887 GHz + 13,688,917,418 instructions # 2.45 insn per cycle + 1.935195895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.712996e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.736353e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.736353e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.785385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.808420e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.808420e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.699524 sec - 4,899,825,358 cycles # 2.877 GHz - 12,342,496,756 instructions # 2.52 insn per cycle - 1.703963805 seconds time elapsed +TOTAL : 1.686810 sec + 4,897,782,017 cycles # 2.898 GHz + 12,342,341,736 instructions # 2.52 insn per cycle + 1.690859675 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.584277e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.599062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.599062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.578298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.591405e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.591405e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.173644 sec - 4,127,419,767 cycles # 1.897 GHz - 6,336,272,499 instructions # 1.54 insn per cycle - 2.177878840 seconds time elapsed +TOTAL : 2.175070 sec + 4,121,604,366 cycles # 1.892 GHz + 6,334,904,963 instructions # 1.54 insn per cycle + 2.179001381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 35703491ac..a5bd4bb577 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:02:14 +DATE: 2023-11-09_18:20:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.483209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.511610e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.495033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523414e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505331 sec - 2,237,004,452 cycles # 3.017 GHz - 3,469,560,739 instructions # 1.55 insn per cycle - 0.813831791 seconds time elapsed +TOTAL : 0.502389 sec + 2,234,960,295 cycles # 3.014 GHz + 3,501,182,478 instructions # 1.57 insn per cycle + 0.813908762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137446e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.169549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.170864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179832e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.063844 sec - 10,025,654,279 cycles # 3.024 GHz - 22,437,691,349 instructions # 2.24 insn per cycle - 3.371428026 seconds time elapsed +TOTAL : 3.070610 sec + 10,014,430,488 cycles # 3.015 GHz + 23,183,698,994 instructions # 2.32 insn per cycle + 3.378407946 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.972408e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.973332e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.973332e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.972782e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.973730e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973730e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.324018 sec - 25,644,049,472 cycles # 3.080 GHz - 78,945,889,994 instructions # 3.08 insn per cycle - 8.328093218 seconds time elapsed +TOTAL : 8.322572 sec + 25,630,767,892 cycles # 3.079 GHz + 78,944,418,555 instructions # 3.08 insn per cycle + 8.326671797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757960e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761409e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761409e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.718928e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.722195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.722195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.373690 sec - 12,932,578,462 cycles # 2.955 GHz - 39,286,223,538 instructions # 3.04 insn per cycle - 4.377750469 seconds time elapsed +TOTAL : 4.419054 sec + 12,933,087,616 cycles # 2.925 GHz + 39,284,437,808 instructions # 3.04 insn per cycle + 4.423270824 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.504027e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.521553e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.521553e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.554509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.572221e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.572221e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.937880 sec - 5,579,002,067 cycles # 2.875 GHz - 13,689,941,055 instructions # 2.45 insn per cycle - 1.941926119 seconds time elapsed +TOTAL : 1.926889 sec + 5,576,123,810 cycles # 2.889 GHz + 13,689,166,422 instructions # 2.45 insn per cycle + 1.931047296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.762551e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.785341e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.785341e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752389e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752389e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.689011 sec - 4,900,729,891 cycles # 2.896 GHz - 12,344,260,353 instructions # 2.52 insn per cycle - 1.693208802 seconds time elapsed +TOTAL : 1.694985 sec + 4,901,721,494 cycles # 2.886 GHz + 12,344,869,251 instructions # 2.52 insn per cycle + 1.699075447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.678242e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.692622e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.692622e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.451359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.144938 sec - 4,120,050,897 cycles # 1.918 GHz - 6,337,719,473 instructions # 1.54 insn per cycle - 2.149063218 seconds time elapsed +TOTAL : 2.210683 sec + 4,119,158,466 cycles # 1.861 GHz + 6,337,202,754 instructions # 1.54 insn per cycle + 2.214903970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index e3bb9b2d2b..e1894928b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:59:02 +DATE: 2023-11-09_18:17:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.202444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.496466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.498519e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.185134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497070e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499968e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.507729 sec - 2,224,053,547 cycles # 2.995 GHz - 3,511,447,697 instructions # 1.58 insn per cycle - 0.804264263 seconds time elapsed +TOTAL : 0.512712 sec + 2,117,085,337 cycles # 2.853 GHz + 3,348,083,687 instructions # 1.58 insn per cycle + 0.802067553 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.754243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.179050e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.746826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180194e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.195418 sec - 10,560,218,824 cycles # 3.053 GHz - 23,272,224,469 instructions # 2.20 insn per cycle - 3.516017944 seconds time elapsed +TOTAL : 3.195850 sec + 10,403,522,722 cycles # 3.010 GHz + 22,812,003,731 instructions # 2.19 insn per cycle + 3.513623861 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.980652e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.981660e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.981660e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978212e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979161e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979161e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.289291 sec - 25,689,530,854 cycles # 3.098 GHz - 78,941,485,494 instructions # 3.07 insn per cycle - 8.293329329 seconds time elapsed +TOTAL : 8.300221 sec + 25,643,059,514 cycles # 3.089 GHz + 78,945,101,648 instructions # 3.08 insn per cycle + 8.304495187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.699396e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.699396e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.720030e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723443e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723443e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.446640 sec - 12,939,707,143 cycles # 2.908 GHz - 39,286,790,527 instructions # 3.04 insn per cycle - 4.450934428 seconds time elapsed +TOTAL : 4.417696 sec + 12,936,090,694 cycles # 2.926 GHz + 39,285,549,332 instructions # 3.04 insn per cycle + 4.421886330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13182) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
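[Note on the EvtsPerSec[...] figures throughout these logs: they are throughputs, i.e. the number of Monte Carlo events processed in a given section divided by that section's wall-clock time. A minimal sketch of the metric (the helper name is hypothetical):

  #include <chrono>
  // Hypothetical helper illustrating the EvtsPerSec metric reported above:
  // throughput = number of events / wall-clock seconds of the timed section.
  template<typename Func>
  double evtsPerSec( int nevt, Func&& processEvents )
  {
    const auto t0 = std::chrono::steady_clock::now();
    processEvents( nevt ); // e.g. phase-space sampling plus matrix elements
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    return nevt / dt.count(); // in sec^-1, as printed in the logs
  }
]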
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.540564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.557570e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.557570e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.467679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484549e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.929715 sec - 5,584,326,574 cycles # 2.891 GHz - 13,690,307,414 instructions # 2.45 insn per cycle - 1.933841922 seconds time elapsed +TOTAL : 1.946291 sec + 5,575,526,782 cycles # 2.860 GHz + 13,689,232,963 instructions # 2.46 insn per cycle + 1.950526745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.772043e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.794673e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.794673e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.714029e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.737204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.737204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.687536 sec - 4,894,600,072 cycles # 2.895 GHz - 12,345,111,795 instructions # 2.52 insn per cycle - 1.691722733 seconds time elapsed +TOTAL : 1.697675 sec + 4,893,869,630 cycles # 2.877 GHz + 12,345,121,576 instructions # 2.52 insn per cycle + 1.701906664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.667022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.680748e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.680748e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.624620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.638794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.638794e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.148381 sec - 4,119,534,680 cycles # 1.915 GHz - 6,337,066,991 instructions # 1.54 insn per cycle - 2.152520896 seconds time elapsed +TOTAL : 2.160421 sec + 4,114,771,943 cycles # 1.902 GHz + 6,336,936,596 instructions # 1.54 insn per cycle + 2.164683207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 2d6466a5d0..d9a60f4c2d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:23:17 +DATE: 2023-11-09_17:44:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.472415e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.497562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.499582e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.474117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501625e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522603 sec - 2,199,879,357 cycles # 2.926 GHz - 3,406,329,945 instructions # 1.55 insn per cycle - 0.812598895 seconds time elapsed +TOTAL : 0.521490 sec + 2,250,098,032 cycles # 2.995 GHz + 3,547,618,625 instructions # 1.58 insn per cycle + 0.811334512 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.151978e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.180898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182114e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144032e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.172097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173315e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.012462 sec - 9,824,681,781 cycles # 3.013 GHz - 20,251,773,236 instructions # 2.06 insn per cycle - 3.320916673 seconds time elapsed +TOTAL : 3.014234 sec + 9,779,736,194 cycles # 2.987 GHz + 19,303,224,180 instructions # 1.97 insn per cycle + 3.330161859 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948786e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949722e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949722e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.971538e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972487e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972487e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.425425 sec - 25,600,858,897 cycles # 3.038 GHz - 78,714,675,174 instructions # 3.07 insn per cycle - 8.429623210 seconds time elapsed +TOTAL : 8.327938 sec + 25,611,620,219 cycles # 3.074 GHz + 78,715,429,796 instructions # 3.07 insn per cycle + 8.332111280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.648721e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.652034e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.652034e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.709838e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713193e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.503525 sec - 12,897,071,716 cycles # 2.862 GHz - 39,231,170,693 instructions # 3.04 insn per cycle - 4.507786711 seconds time elapsed +TOTAL : 4.429736 sec + 12,908,947,595 cycles # 2.912 GHz + 39,230,824,629 instructions # 3.04 insn per cycle + 4.433832156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
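[Note on the "Internal loops fptype_sv = VECTOR[n]" lines: they state how many floating point values each SIMD build packs into one vector (in the double-precision logs, 2 for 'sse4', 4 for 'avx2'/'512y', 8 for '512z'). A minimal sketch of such vector types using GCC/Clang vector extensions, the general technique behind these builds (type names illustrative, not the exact cudacpp typedefs):

  // Illustrative only: SIMD vectors of doubles via compiler vector extensions.
  typedef double fptype;
  typedef fptype fptype_v2 __attribute__(( vector_size( 16 ) )); // 2 doubles ('sse4', 128bit)
  typedef fptype fptype_v4 __attribute__(( vector_size( 32 ) )); // 4 doubles ('avx2'/'512y', 256bit)
  typedef fptype fptype_v8 __attribute__(( vector_size( 64 ) )); // 8 doubles ('512z', 512bit)
  // One expression operates on all lanes at once:
  fptype_v4 axpy( fptype a, fptype_v4 x, fptype_v4 y ) { return a * x + y; }
]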
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.358235e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.375211e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.375211e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.184366e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.200734e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.200734e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.971459 sec - 5,607,121,481 cycles # 2.839 GHz - 13,803,544,350 instructions # 2.46 insn per cycle - 1.975775051 seconds time elapsed +TOTAL : 2.013363 sec + 5,615,451,412 cycles # 2.785 GHz + 13,804,151,174 instructions # 2.46 insn per cycle + 2.017493867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.338508e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.360185e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.360185e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.496512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.518383e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.518383e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.768893 sec - 4,962,697,559 cycles # 2.805 GHz - 12,469,802,045 instructions # 2.51 insn per cycle - 1.786199910 seconds time elapsed +TOTAL : 1.736002 sec + 4,961,501,370 cycles # 2.852 GHz + 12,469,539,646 instructions # 2.51 insn per cycle + 1.740286680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.426426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.440315e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.440315e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.549305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.563023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.563023e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.218010 sec - 4,123,694,980 cycles # 1.856 GHz - 6,461,412,200 instructions # 1.57 insn per cycle - 2.222394946 seconds time elapsed +TOTAL : 2.181875 sec + 4,116,495,870 cycles # 1.884 GHz + 6,461,064,172 instructions # 1.57 insn per cycle + 2.186117492 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index a4e352ee76..909bf4e735 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:43:16 +DATE: 2023-11-09_18:01:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.232524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.256814e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.259170e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265061e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.534744 sec - 2,248,485,126 cycles # 2.941 GHz - 3,494,101,027 instructions # 1.55 insn per cycle - 0.823969121 seconds time elapsed +TOTAL : 0.531588 sec + 2,281,405,083 cycles # 2.976 GHz + 3,558,676,633 instructions # 1.56 insn per cycle + 0.825879944 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.777627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.804807e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.805966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.802017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.803118e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.297104 sec - 10,673,501,263 cycles # 3.005 GHz - 24,226,094,920 instructions # 2.27 insn per cycle - 3.607615064 seconds time elapsed +TOTAL : 3.293832 sec + 10,794,008,612 cycles # 3.043 GHz + 23,569,569,961 instructions # 2.18 insn per cycle + 3.607202529 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.346513e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.346993e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.346993e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.420862e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.421336e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.421336e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.741985 sec - 113,582,106,901 cycles # 3.009 GHz - 144,968,769,114 instructions # 1.28 insn per cycle - 37.746219696 seconds time elapsed +TOTAL : 37.106861 sec + 113,630,776,289 cycles # 3.063 GHz + 144,980,863,935 instructions # 1.28 insn per cycle + 37.110990461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.143430e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145919e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145919e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.248348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248348e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.226537 sec - 14,726,949,716 cycles # 2.816 GHz - 37,578,521,140 instructions # 2.55 insn per cycle - 5.230978594 seconds time elapsed +TOTAL : 5.061979 sec + 14,717,920,983 cycles # 2.906 GHz + 37,577,837,464 instructions # 2.55 insn per cycle + 5.066177833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) 
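[Note on the cycles and instructions lines: these are perf-style hardware counters, and the derived figures in the logs follow directly from them. For the sse4 double inl1 run just above, for instance,

  \mathrm{IPC} = \frac{37{,}577{,}837{,}464\ \text{instructions}}{14{,}717{,}920{,}983\ \text{cycles}} \approx 2.55\ \text{insn per cycle}

matching the quoted value, and dividing the cycle count by the elapsed time similarly reproduces the quoted ~2.9 GHz.]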
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.619134e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.633428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.633428e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.791579e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806069e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806069e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.162328 sec - 6,132,958,052 cycles # 2.832 GHz - 13,063,746,182 instructions # 2.13 insn per cycle - 2.166766443 seconds time elapsed +TOTAL : 2.114146 sec + 6,120,754,225 cycles # 2.890 GHz + 13,063,521,271 instructions # 2.13 insn per cycle + 2.118343855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.242664e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.263271e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.263271e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.380050e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.401402e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.401402e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.783918 sec - 5,064,574,027 cycles # 2.835 GHz - 11,442,541,397 instructions # 2.26 insn per cycle - 1.788276031 seconds time elapsed +TOTAL : 1.757697 sec + 5,060,306,566 cycles # 2.873 GHz + 11,442,262,844 instructions # 2.26 insn per cycle + 1.761841609 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.693472e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.708550e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.708550e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.755291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.769173e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.769173e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.141610 sec - 3,984,341,945 cycles # 1.859 GHz - 5,944,587,769 instructions # 1.49 insn per cycle - 2.145941939 seconds time elapsed +TOTAL : 2.124539 sec + 3,983,245,523 cycles # 1.872 GHz + 5,944,184,553 instructions # 1.49 insn per cycle + 2.128814459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index c9a3c0bc00..8be167a2b3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:44:25 +DATE: 2023-11-09_18:02:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.238547e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.282651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285304e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528864 sec - 2,246,214,726 cycles # 2.961 GHz - 3,512,868,349 instructions # 1.56 insn per cycle - 0.816400547 seconds time elapsed +TOTAL : 0.525087 sec + 2,271,033,028 cycles # 3.019 GHz + 3,503,626,972 instructions # 1.54 insn per cycle + 0.810303022 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792504e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.819675e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.820783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.795218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.822430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823559e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.270254 sec - 10,633,900,320 cycles # 3.014 GHz - 24,514,837,826 instructions # 2.31 insn per cycle - 3.584387558 seconds time elapsed +TOTAL : 3.267432 sec + 10,775,752,309 cycles # 3.062 GHz + 23,804,895,620 instructions # 2.21 insn per cycle + 3.575584891 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.327617e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.328084e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.328084e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.382161e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.382658e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.382658e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.906136 sec - 114,405,747,001 cycles # 3.018 GHz - 145,562,165,740 instructions # 1.27 insn per cycle - 37.910396057 seconds time elapsed +TOTAL : 37.434460 sec + 114,573,902,263 cycles # 3.060 GHz + 145,559,795,063 instructions # 1.27 insn per cycle + 37.438717752 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.120905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.123383e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.123383e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.172461e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.174968e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174968e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.264434 sec - 15,164,870,179 cycles # 2.879 GHz - 37,765,103,372 instructions # 2.49 insn per cycle - 5.268658441 seconds time elapsed +TOTAL : 5.178309 sec + 15,150,664,399 cycles # 2.924 GHz + 37,765,142,558 instructions # 2.49 insn per cycle + 5.182585019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) 
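[Note on the inl1 logs in this file: inlineHel=1 in the banner lines corresponds to builds where the helicity-amplitude functions are force-inlined. A minimal sketch of the kind of preprocessor toggle involved, assuming a switch of this shape (macro and function names are placeholders, not the exact cudacpp code):

  // Illustrative sketch of an "inline helicity amplitudes" build switch.
  #ifdef INLINE_HELAMPS
  #define HELINLINE inline __attribute__(( always_inline ))
  #else
  #define HELINLINE // no forced inlining: amplitudes stay out-of-line
  #endif
  HELINLINE double ffv1Amp( double w1, double w2, double coup ) // toy amplitude
  {
    return coup * w1 * w2;
  }
]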
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.815263e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.829969e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.829969e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.899691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.107998 sec - 6,006,546,140 cycles # 2.845 GHz - 12,898,448,008 instructions # 2.15 insn per cycle - 2.112261899 seconds time elapsed +TOTAL : 2.085123 sec + 6,007,372,451 cycles # 2.876 GHz + 12,897,891,125 instructions # 2.15 insn per cycle + 2.089322243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.170106e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.191645e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.191645e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.290925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.312116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.312116e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.798019 sec - 5,110,595,937 cycles # 2.837 GHz - 11,448,746,145 instructions # 2.24 insn per cycle - 1.802331588 seconds time elapsed +TOTAL : 1.774574 sec + 5,109,183,395 cycles # 2.874 GHz + 11,448,665,866 instructions # 2.24 insn per cycle + 1.778819443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.719086e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.733849e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.733849e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.900466e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.915540e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.915540e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.134583 sec - 3,969,461,110 cycles # 1.857 GHz - 5,897,831,571 instructions # 1.49 insn per cycle - 2.138816528 seconds time elapsed +TOTAL : 2.085227 sec + 3,957,731,000 cycles # 1.895 GHz + 5,897,967,734 instructions # 1.49 insn per cycle + 2.089481596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 9c1de01f16..24e6fadbe8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:23:53 +DATE: 2023-11-09_17:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.293342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.339166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.344348e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.337209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.383457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391632e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481289 sec - 2,043,429,418 cycles # 2.945 GHz - 3,016,391,404 instructions # 1.48 insn per cycle - 0.753087040 seconds time elapsed +TOTAL : 0.480161 sec + 2,056,195,749 cycles # 2.969 GHz + 3,041,501,171 instructions # 1.48 insn per cycle + 0.751973888 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.613713e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.676727e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.679629e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.613057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.675362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.678111e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.713007 sec - 5,846,211,987 cycles # 2.997 GHz - 12,059,135,892 instructions # 2.06 insn per cycle - 2.007812305 seconds time elapsed +TOTAL : 1.713246 sec + 5,908,983,228 cycles # 3.045 GHz + 11,684,311,184 instructions # 1.98 insn per cycle + 1.997404675 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.005115e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.006106e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.006106e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.054709e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.055772e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.055772e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.187511 sec - 24,627,671,323 cycles # 3.007 GHz - 78,134,663,224 instructions # 3.17 insn per cycle - 8.191568767 seconds time elapsed +TOTAL : 7.990756 sec + 24,645,365,645 cycles # 3.083 GHz + 78,136,702,059 instructions # 3.17 insn per cycle + 7.994878538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.313136e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.326827e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.326827e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.432830e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.446994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446994e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.250414 sec - 6,477,846,372 cycles # 2.874 GHz - 20,124,481,745 instructions # 3.11 insn per cycle - 2.254575609 seconds time elapsed +TOTAL : 2.213938 sec + 6,478,911,538 cycles # 2.922 GHz + 20,124,199,414 instructions # 3.11 insn per cycle + 2.218115274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.651750e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.658578e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.658578e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680617e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.687674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.687674e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.000733 sec - 2,836,203,846 cycles # 2.824 GHz - 6,991,580,060 instructions # 2.47 insn per cycle - 1.005051926 seconds time elapsed +TOTAL : 0.983889 sec + 2,838,821,051 cycles # 2.875 GHz + 6,991,598,423 instructions # 2.46 insn per cycle + 0.988065526 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.891596e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850029e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850029e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.874979 sec - 2,489,876,695 cycles # 2.834 GHz - 6,298,919,091 instructions # 2.53 insn per cycle - 0.879145628 seconds time elapsed +TOTAL : 0.898688 sec + 2,488,990,380 cycles # 2.759 GHz + 6,298,918,188 instructions # 2.53 insn per cycle + 0.902843603 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.492404e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498044e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498044e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538961e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547910e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547910e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.107211 sec - 2,056,905,721 cycles # 1.852 GHz - 3,268,863,177 instructions # 1.59 insn per cycle - 1.111361855 seconds time elapsed +TOTAL : 1.073829 sec + 2,048,858,820 cycles # 1.904 GHz + 3,269,526,835 instructions # 1.60 insn per cycle + 1.078196054 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 7ef08eb1a1..741b2db05e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:54:33 +DATE: 2023-11-09_18:12:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.630785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.310772e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.310772e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.661835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.358766e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.466074 sec - 1,998,575,796 cycles # 2.933 GHz - 2,994,965,957 instructions # 1.50 insn per cycle - 0.738328183 seconds time elapsed +TOTAL : 0.465492 sec + 2,015,187,483 cycles # 2.973 GHz + 3,002,049,942 instructions # 1.49 insn per cycle + 0.734544576 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.261662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.481805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.481805e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.271779e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483162e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.889932 sec - 6,363,844,307 cycles # 2.984 GHz - 13,005,964,068 instructions # 2.04 insn per cycle - 2.191280597 seconds time elapsed +TOTAL : 1.878261 sec + 6,418,416,087 cycles # 3.037 GHz + 13,442,701,753 instructions # 2.09 insn per cycle + 2.169965161 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.002381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003373e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003373e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.022346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.023320e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.023320e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.200661 sec - 24,662,776,052 cycles # 3.006 GHz - 78,138,608,532 instructions # 3.17 insn per cycle - 8.204934256 seconds time elapsed +TOTAL : 8.120106 sec + 24,656,495,142 cycles # 3.035 GHz + 78,138,532,268 instructions # 3.17 insn per cycle + 8.124268827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.306848e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.320652e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.320652e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.385899e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400170e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400170e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.255006 sec - 6,482,848,456 cycles # 2.870 GHz - 20,133,573,977 instructions # 3.11 insn per cycle - 2.259320427 seconds time elapsed +TOTAL : 2.230708 sec + 6,485,115,953 cycles # 2.903 GHz + 20,133,634,822 instructions # 3.10 insn per cycle + 2.234788671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
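[Note on the "Set grid in Bridge" warnings in this bridge log: the grid fixes the number of events per GPU iteration directly from the CUDA launch configuration, consistently with the figures quoted above:

  n_\text{evt} = n_\text{gpublocks} \times n_\text{gputhreads} = 64 \times 256 = 16384, \qquad 2048 \times 256 = 524288
]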
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.648854e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655690e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666755e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.673825e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673825e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.005313 sec - 2,849,286,060 cycles # 2.824 GHz - 7,001,856,779 instructions # 2.46 insn per cycle - 1.009712120 seconds time elapsed +TOTAL : 0.994493 sec + 2,844,577,237 cycles # 2.850 GHz + 7,001,609,472 instructions # 2.46 insn per cycle + 0.998731395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.888498e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.898036e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.898036e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.867923e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.876610e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876610e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.879137 sec - 2,499,075,063 cycles # 2.831 GHz - 6,309,019,763 instructions # 2.52 insn per cycle - 0.883537991 seconds time elapsed +TOTAL : 0.888528 sec + 2,499,243,226 cycles # 2.802 GHz + 6,308,730,841 instructions # 2.52 insn per cycle + 0.892798888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.493195e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498802e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498802e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.109448 sec - 2,060,050,205 cycles # 1.851 GHz - 3,279,571,633 instructions # 1.59 insn per cycle - 1.113744599 seconds time elapsed +TOTAL : 1.107724 sec + 2,056,932,102 cycles # 1.850 GHz + 3,279,291,488 instructions # 1.59 insn per cycle + 1.112281401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 4d664fc4d6..341f303aae 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_22:06:09 +DATE: 2023-11-09_18:24:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.355118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.403966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.409182e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.311526e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.361390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.366448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.462753 sec - 2,014,223,981 cycles # 2.997 GHz - 3,038,538,632 instructions # 1.51 insn per cycle - 0.729935758 seconds time elapsed +TOTAL : 0.464707 sec + 2,008,078,996 cycles # 2.985 GHz + 3,036,723,964 instructions # 1.51 insn per cycle + 0.732085987 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.565713e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.634653e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.637720e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.547836e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.616999e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.620197e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.796992 sec
- 6,172,975,790 cycles # 3.046 GHz
- 13,083,554,495 instructions # 2.12 insn per cycle
- 2.086223865 seconds time elapsed
+TOTAL : 1.809045 sec
+ 6,020,726,960 cycles # 2.958 GHz
+ 11,569,273,710 instructions # 1.92 insn per cycle
+ 2.092173630 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.045430e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.046401e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.046401e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.048605e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.049604e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.049604e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.027601 sec
- 24,633,930,216 cycles # 3.068 GHz
- 78,134,736,063 instructions # 3.17 insn per cycle
- 8.031555788 seconds time elapsed
+TOTAL : 8.015102 sec
+ 24,651,277,493 cycles # 3.074 GHz
+ 78,133,763,667 instructions # 3.17 insn per cycle
+ 8.018994302 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.461058e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.474893e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.474893e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.377691e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.391250e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.391250e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4
-TOTAL : 2.206755 sec
- 6,481,821,994 cycles # 2.933 GHz
- 20,123,351,594 instructions # 3.10 insn per cycle
- 2.210721958 seconds time elapsed
+TOTAL : 2.232285 sec
+ 6,481,088,653 cycles # 2.899 GHz
+ 20,124,382,938 instructions # 3.11 insn per cycle
+ 2.236275849 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.665888e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.672800e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.672800e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.686029e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.693351e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.693351e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.994258 sec
- 2,841,630,041 cycles # 2.848 GHz
- 6,990,811,149 instructions # 2.46 insn per cycle
- 0.998209890 seconds time elapsed
+TOTAL : 0.981416 sec
+ 2,838,446,580 cycles # 2.882 GHz
+ 6,989,000,726 instructions # 2.46 insn per cycle
+ 0.985356553 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.891296e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.900721e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.900721e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.921238e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.930307e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.930307e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4
-TOTAL : 0.876906 sec
- 2,495,700,726 cycles # 2.835 GHz
- 6,297,076,618 instructions # 2.52 insn per cycle
- 0.880948978 seconds time elapsed
+TOTAL : 0.863261 sec
+ 2,495,681,706 cycles # 2.880 GHz
+ 6,297,112,783 instructions # 2.52 insn per cycle
+ 0.867346097 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.552086e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.558027e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.558027e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.544822e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.550907e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.550907e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.070949 sec
- 2,049,379,894 cycles # 1.917 GHz
- 3,265,032,857 instructions # 1.59 insn per cycle
- 1.069477010 seconds time elapsed
+TOTAL : 1.070627 sec
+ 2,048,550,465 cycles # 1.908 GHz
+ 3,265,201,106 instructions # 1.59 insn per cycle
+ 1.074629445 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index ee315233c1..63178ad027 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_22:02:50
+DATE: 2023-11-09_18:21:06
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.328542e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.377951e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.383103e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.362546e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.415600e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.420893e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.460444 sec
- 2,025,388,470 cycles # 3.016 GHz
- 3,026,490,924 instructions # 1.49 insn per cycle
- 0.728886791 seconds time elapsed
+TOTAL : 0.460752 sec
+ 2,005,673,830 cycles # 2.989 GHz
+ 2,996,841,960 instructions # 1.49 insn per cycle
+ 0.729795900 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.561347e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.630349e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.633332e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.567426e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.636917e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.639909e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.742876 sec
- 6,025,656,195 cycles # 3.063 GHz
- 13,153,972,386 instructions # 2.18 insn per cycle
- 2.023922647 seconds time elapsed
+TOTAL : 1.748848 sec
+ 5,960,134,043 cycles # 3.018 GHz
+ 12,821,096,532 instructions # 2.15 insn per cycle
+ 2.031326515 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.049881e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.050905e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.050905e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.057810e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.058811e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.058811e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.008804 sec
- 24,622,379,845 cycles # 3.073 GHz
- 78,134,077,156 instructions # 3.17 insn per cycle
- 8.012721206 seconds time elapsed
+TOTAL : 7.977700 sec
+ 24,629,048,089 cycles # 3.086 GHz
+ 78,132,914,520 instructions # 3.17 insn per cycle
+ 7.981637101 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.445321e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.458917e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.458917e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.439696e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.453635e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.453635e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.210782 sec
- 6,475,852,782 cycles # 2.925 GHz
- 20,124,175,553 instructions # 3.11 insn per cycle
- 2.214842110 seconds time elapsed
+TOTAL : 2.212183 sec
+ 6,477,339,632 cycles # 2.924 GHz
+ 20,124,428,604 instructions # 3.11 insn per cycle
+ 2.216339188 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.697514e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.704851e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.704851e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.594939e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.601395e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.601395e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.973836 sec
- 2,835,149,001 cycles # 2.901 GHz
- 6,991,410,852 instructions # 2.47 insn per cycle
- 0.977864307 seconds time elapsed
+TOTAL : 1.036317 sec
+ 2,842,114,214 cycles # 2.733 GHz
+ 6,991,999,004 instructions # 2.46 insn per cycle
+ 1.040742925 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.934817e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.944385e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.944385e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.922697e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.931896e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.931896e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.855140 sec
- 2,487,419,693 cycles # 2.897 GHz
- 6,298,706,089 instructions # 2.53 insn per cycle
- 0.859052723 seconds time elapsed
+TOTAL : 0.860619 sec
+ 2,490,053,798 cycles # 2.883 GHz
+ 6,298,956,842 instructions # 2.53 insn per cycle
+ 0.864591382 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.555511e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561377e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561377e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.532542e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.532542e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.062258 sec
- 2,048,558,209 cycles # 1.923 GHz
- 3,268,764,234 instructions # 1.60 insn per cycle
- 1.066272803 seconds time elapsed
+TOTAL : 1.082468 sec
+ 2,049,657,294 cycles # 1.888 GHz
+ 3,269,097,732 instructions # 1.59 insn per cycle
+ 1.086487061 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index efdbcfe1ae..2548057249 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:59:38
+DATE: 2023-11-09_18:17:53
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.758974e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.368878e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.373916e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.733376e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.369757e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.375069e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.462598 sec
- 2,002,083,704 cycles # 2.975 GHz
- 3,028,559,110 instructions # 1.51 insn per cycle
- 0.730010364 seconds time elapsed
+TOTAL : 0.461931 sec
+ 2,018,026,618 cycles # 3.001 GHz
+ 3,012,517,263 instructions # 1.49 insn per cycle
+ 0.729223266 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -71,14 +71,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.506677e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.634226e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.637242e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.494168e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.614081e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.617046e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.818599 sec
- 6,254,092,117 cycles # 3.058 GHz
- 12,631,559,563 instructions # 2.02 insn per cycle
- 2.110653596 seconds time elapsed
+TOTAL : 1.820293 sec
+ 6,248,771,087 cycles # 3.054 GHz
+ 13,452,131,003 instructions # 2.15 insn per cycle
+ 2.111868581 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.065897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.066912e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.066912e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.046417e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.047425e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.047425e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.946634 sec
- 24,618,185,681 cycles # 3.097 GHz
- 78,133,594,453 instructions # 3.17 insn per cycle
- 7.950536612 seconds time elapsed
+TOTAL : 8.022116 sec
+ 24,641,165,344 cycles # 3.070 GHz
+ 78,133,947,271 instructions # 3.17 insn per cycle
+ 8.026095295 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3602) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.469422e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.483642e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.483642e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.418320e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.431601e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.431601e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.203521 sec
- 6,477,304,059 cycles # 2.935 GHz
- 20,124,231,259 instructions # 3.11 insn per cycle
- 2.207560981 seconds time elapsed
+TOTAL : 2.218818 sec
+ 6,476,858,939 cycles # 2.915 GHz
+ 20,124,080,031 instructions # 3.11 insn per cycle
+ 2.222978465 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.692268e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.699231e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.699231e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.673631e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.680333e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.680333e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.976738 sec
- 2,836,504,426 cycles # 2.894 GHz
- 6,991,415,909 instructions # 2.46 insn per cycle
- 0.980720950 seconds time elapsed
+TOTAL : 0.987621 sec
+ 2,839,487,470 cycles # 2.865 GHz
+ 6,991,564,753 instructions # 2.46 insn per cycle
+ 0.991606693 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.804638e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.812886e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.812886e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.883610e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.892494e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.892494e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.917702 sec
- 2,493,684,467 cycles # 2.707 GHz
- 6,299,926,195 instructions # 2.53 insn per cycle
- 0.922017124 seconds time elapsed
+TOTAL : 0.878347 sec
+ 2,488,399,526 cycles # 2.822 GHz
+ 6,298,882,599 instructions # 2.53 insn per cycle
+ 0.882234875 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.542695e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.548647e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.548647e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.534627e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.540393e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.540393e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.070949 sec
- 2,049,167,689 cycles # 1.907 GHz
- 3,268,610,487 instructions # 1.60 insn per cycle
- 1.074921168 seconds time elapsed
+TOTAL : 1.076545 sec
+ 2,047,724,498 cycles # 1.897 GHz
+ 3,268,770,442 instructions # 1.60 insn per cycle
+ 1.080450235 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2415) (512y: 46) (512z: 9571)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index afc8dc6250..3e46ada377 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:24:23
+DATE: 2023-11-09_17:45:41
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.334864e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.384627e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.390200e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.305671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.350688e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.358120e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.478485 sec
- 2,037,012,252 cycles # 2.938 GHz
- 3,030,553,414 instructions # 1.49 insn per cycle
- 0.751162438 seconds time elapsed
+TOTAL : 0.483459 sec
+ 2,029,909,968 cycles # 2.855 GHz
+ 2,962,980,745 instructions # 1.46 insn per cycle
+ 0.768081643 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.576633e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.638822e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.641657e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.574581e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.636147e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.638743e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.723660 sec
- 5,841,021,027 cycles # 2.992 GHz
- 11,140,232,262 instructions # 1.91 insn per cycle
- 2.010396879 seconds time elapsed
+TOTAL : 1.716632 sec
+ 5,921,965,881 cycles # 3.044 GHz
+ 11,852,981,757 instructions # 2.00 insn per cycle
+ 2.001901523 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.020250e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.021294e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.021294e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.062728e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.063773e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.063773e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.126388 sec
- 24,531,986,763 cycles # 3.018 GHz
- 77,860,700,825 instructions # 3.17 insn per cycle
- 8.130365170 seconds time elapsed
+TOTAL : 7.958572 sec
+ 24,559,190,224 cycles # 3.085 GHz
+ 77,859,989,303 instructions # 3.17 insn per cycle
+ 7.962642501 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3113) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.508420e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.523945e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.523945e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.583566e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.598037e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.598037e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.192196 sec
- 6,417,749,314 cycles # 2.923 GHz
- 20,089,444,717 instructions # 3.13 insn per cycle
- 2.196603069 seconds time elapsed
+TOTAL : 2.170856 sec
+ 6,426,627,449 cycles # 2.956 GHz
+ 20,090,039,565 instructions # 3.13 insn per cycle
+ 2.175014616 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.619246e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.625936e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.625936e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.591188e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.597484e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.597484e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.020667 sec
- 2,904,857,639 cycles # 2.836 GHz
- 7,133,491,112 instructions # 2.46 insn per cycle
- 1.024733034 seconds time elapsed
+TOTAL : 1.038604 sec
+ 2,902,688,212 cycles # 2.785 GHz
+ 7,133,529,057 instructions # 2.46 insn per cycle
+ 1.042821386 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.807219e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.815471e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.815471e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.840190e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.848739e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.848739e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.915311 sec
- 2,597,440,177 cycles # 2.827 GHz
- 6,442,073,160 instructions # 2.48 insn per cycle
- 0.919440444 seconds time elapsed
+TOTAL : 0.898885 sec
+ 2,595,883,470 cycles # 2.877 GHz
+ 6,441,979,586 instructions # 2.48 insn per cycle
+ 0.902832877 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.330502e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.335014e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.335014e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.492137e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.497778e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.497778e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.241025 sec
- 2,122,770,451 cycles # 1.706 GHz
- 3,430,866,539 instructions # 1.62 insn per cycle
- 1.245371552 seconds time elapsed
+TOTAL : 1.106744 sec
+ 2,123,250,955 cycles # 1.918 GHz
+ 3,431,574,417 instructions # 1.62 insn per cycle
+ 1.110853762 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2912) (512y: 22) (512z: 9647)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 86542f0b70..764181f824 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:45:34
+DATE: 2023-11-09_18:03:57
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.570490e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.610069e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.614296e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.601175e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.638676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.643535e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.491125 sec
- 2,098,886,797 cycles # 2.948 GHz
- 3,121,764,784 instructions # 1.49 insn per cycle
- 0.773983413 seconds time elapsed
+TOTAL : 0.484907 sec
+ 2,101,276,759 cycles # 2.981 GHz
+ 3,149,706,582 instructions # 1.50 insn per cycle
+ 0.766736785 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.716470e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.775515e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.778049e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.695736e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.752372e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.754868e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.856510 sec
- 6,241,842,396 cycles # 2.982 GHz
- 13,362,161,836 instructions # 2.14 insn per cycle
- 2.150637345 seconds time elapsed
+TOTAL : 1.853224 sec
+ 6,303,125,801 cycles # 3.016 GHz
+ 12,982,819,660 instructions # 2.06 insn per cycle
+ 2.146815240 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.736455e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.737287e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737287e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.841033e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.841866e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.841866e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.600113 sec
- 86,425,718,035 cycles # 3.022 GHz
- 135,574,556,258 instructions # 1.57 insn per cycle
- 28.604413837 seconds time elapsed
+TOTAL : 28.085663 sec
+ 86,167,672,431 cycles # 3.068 GHz
+ 135,565,357,772 instructions # 1.57 insn per cycle
+ 28.089696347 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15486) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.030289e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.043211e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.043211e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.152037e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.164422e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.164422e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.341197 sec
- 6,779,953,097 cycles # 2.892 GHz
- 19,387,529,866 instructions # 2.86 insn per cycle
- 2.345543121 seconds time elapsed
+TOTAL : 2.302124 sec
+ 6,785,316,910 cycles # 2.944 GHz
+ 19,388,398,647 instructions # 2.86 insn per cycle
+ 2.306338036 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.479111e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.484786e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.484786e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.500496e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.506041e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.506041e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.117197 sec
- 3,179,595,887 cycles # 2.837 GHz
- 6,808,760,792 instructions # 2.14 insn per cycle
- 1.121370768 seconds time elapsed
+TOTAL : 1.100781 sec
+ 3,177,227,261 cycles # 2.877 GHz
+ 6,808,813,623 instructions # 2.14 insn per cycle
+ 1.104867562 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.783416e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.791440e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.791440e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.797362e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.805452e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.805452e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.927417 sec
- 2,649,120,857 cycles # 2.846 GHz
- 5,987,099,017 instructions # 2.26 insn per cycle
- 0.931540821 seconds time elapsed
+TOTAL : 0.920545 sec
+ 2,652,149,170 cycles # 2.870 GHz
+ 5,986,924,086 instructions # 2.26 insn per cycle
+ 0.924698406 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.490502e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.495988e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.495988e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.476030e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.481355e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.481355e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.108557 sec
- 2,075,562,698 cycles # 1.867 GHz
- 3,501,563,321 instructions # 1.69 insn per cycle
- 1.112823809 seconds time elapsed
+TOTAL : 1.119541 sec
+ 2,077,679,044 cycles # 1.851 GHz
+ 3,501,921,791 instructions # 1.69 insn per cycle
+ 1.123804705 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 4737cdf8e3..7b7c373ccc 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:46:27
+DATE: 2023-11-09_18:04:48
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.528505e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.572699e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.577185e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.541471e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.579175e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.583358e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.485528 sec
- 2,086,161,680 cycles # 2.950 GHz
- 3,149,356,396 instructions # 1.51 insn per cycle
- 0.766853446 seconds time elapsed
+TOTAL : 0.484837 sec
+ 2,105,287,248 cycles # 2.990 GHz
+ 3,132,361,933 instructions # 1.49 insn per cycle
+ 0.765772342 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.640879e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.699452e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.702171e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.694480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.751016e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.753615e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.863196 sec
- 6,301,645,470 cycles # 3.002 GHz
- 12,163,417,933 instructions # 1.93 insn per cycle
- 2.157068829 seconds time elapsed
+TOTAL : 1.853183 sec
+ 6,341,276,975 cycles # 3.036 GHz
+ 13,434,801,047 instructions # 2.12 insn per cycle
+ 2.144878674 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.763152e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.763992e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.763992e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.834166e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.834994e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.834994e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.466782 sec
- 86,160,161,464 cycles # 3.027 GHz
- 135,907,402,983 instructions # 1.58 insn per cycle
- 28.470931551 seconds time elapsed
+TOTAL : 28.118822 sec
+ 86,081,697,198 cycles # 3.062 GHz
+ 135,906,074,576 instructions # 1.58 insn per cycle
+ 28.122852922 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.954712e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.967174e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.967174e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.132688e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.145964e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.145964e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.366132 sec
- 6,848,483,827 cycles # 2.890 GHz
- 19,440,750,063 instructions # 2.84 insn per cycle
- 2.370332980 seconds time elapsed
+TOTAL : 2.306989 sec
+ 6,845,463,882 cycles # 2.963 GHz
+ 19,440,308,006 instructions # 2.84 insn per cycle
+ 2.311118522 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.511072e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.516863e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.516863e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.544215e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.549994e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.549994e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.093285 sec
- 3,106,954,835 cycles # 2.833 GHz
- 6,720,019,206 instructions # 2.16 insn per cycle
- 1.097556495 seconds time elapsed
+TOTAL : 1.069611 sec
+ 3,120,065,313 cycles # 2.908 GHz
+ 6,719,636,670 instructions # 2.15 insn per cycle
+ 1.073683656 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.791720e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.799978e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.799978e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.829756e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.837937e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.837937e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.924560 sec
- 2,625,881,689 cycles # 2.831 GHz
- 5,970,468,600 instructions # 2.27 insn per cycle
- 0.928699193 seconds time elapsed
+TOTAL : 0.904097 sec
+ 2,625,695,846 cycles # 2.892 GHz
+ 5,970,269,399 instructions # 2.27 insn per cycle
+ 0.908318447 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.485772e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.491338e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.491338e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.517896e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.523661e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.523661e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.112143 sec
- 2,079,682,688 cycles # 1.864 GHz
- 3,494,926,799 instructions # 1.68 insn per cycle
- 1.116310984 seconds time elapsed
+TOTAL : 1.088404 sec
+ 2,079,379,564 cycles # 1.905 GHz
+ 3,494,888,851 instructions # 1.68 insn per cycle
+ 1.092417864 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 0d88057431..93a0b75f12 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-08_21:24:52
+DATE: 2023-11-09_17:46:11
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.461953e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.486921e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.488984e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.470867e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.494695e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.496793e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.524274 sec
- 2,213,988,684 cycles # 2.939 GHz
- 3,460,274,141 instructions # 1.56 insn per cycle
- 0.814878779 seconds time elapsed
+TOTAL : 0.522613 sec
+ 2,231,403,620 cycles # 2.972 GHz
+ 3,427,736,994 instructions # 1.54 insn per cycle
+ 0.812895260 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.131317e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.159899e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.161114e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.127962e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.155846e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.156998e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.024560 sec
- 9,783,019,983 cycles # 2.986 GHz
- 21,052,355,005 instructions # 2.15 insn per cycle
- 3.333798384 seconds time elapsed
+TOTAL : 3.024603 sec
+ 10,040,286,484 cycles # 3.065 GHz
+ 20,701,312,854 instructions # 2.06 insn per cycle
+ 3.332453984 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.908400e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.909295e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.909295e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.954833e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.955774e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.955774e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.603127 sec
- 25,922,951,314 cycles # 3.012 GHz
- 79,444,287,848 instructions # 3.06 insn per cycle
- 8.607377110 seconds time elapsed
+TOTAL : 8.399305 sec
+ 25,922,061,106 cycles # 3.085 GHz
+ 79,443,494,538 instructions # 3.06 insn per cycle
+ 8.403427486 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4857) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.601676e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.605199e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.605199e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.761504e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.765123e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.765123e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.563626 sec
- 12,670,494,381 cycles # 2.774 GHz
- 38,555,115,428 instructions # 3.04 insn per cycle
- 4.567958025 seconds time elapsed
+TOTAL : 4.369308 sec
+ 12,659,894,478 cycles # 2.895 GHz
+ 38,554,080,405 instructions # 3.05 insn per cycle
+ 4.373596593 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.436133e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.453065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.453065e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.648175e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.665781e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.665781e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.953575 sec - 5,515,640,809 cycles # 2.818 GHz - 13,484,131,277 instructions # 2.44 insn per cycle - 1.957940467 seconds time elapsed +TOTAL : 1.905268 sec + 5,516,001,376 cycles # 2.890 GHz + 13,483,921,346 instructions # 2.44 insn per cycle + 1.909531551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.530089e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.553433e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.553433e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.803935e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.827738e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.827738e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730211 sec - 4,882,100,767 cycles # 2.816 GHz - 12,140,913,078 instructions # 2.49 insn per cycle - 1.734496344 seconds time elapsed +TOTAL : 1.682277 sec + 4,871,353,432 cycles # 2.890 GHz + 12,140,803,788 instructions # 2.49 insn per cycle + 1.686455915 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.332978e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.346275e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.346275e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.374652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.387771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.387771e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.246181 sec - 4,144,338,295 cycles # 1.842 GHz - 6,339,235,304 instructions # 1.53 insn per cycle - 2.250536993 seconds time elapsed +TOTAL : 2.233464 sec + 4,145,054,475 cycles # 1.853 GHz + 6,339,255,297 instructions # 1.53 insn per cycle + 2.237809120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1802) (512y: 93) (512z: 9358) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 154c33870f..5c4ca592f3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-08_21:25:29 +DATE: 2023-11-09_17:46:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.466139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.493568e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.487617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514706e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523033 sec - 2,231,792,351 cycles # 2.947 GHz - 3,493,743,246 instructions # 1.57 insn per cycle - 0.817222718 seconds time elapsed +TOTAL : 0.518884 sec + 2,241,934,817 cycles # 2.999 GHz + 3,518,298,272 instructions # 1.57 insn per cycle + 0.808606683 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.131184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.159088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.160252e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.022684 sec - 9,525,982,822 cycles # 2.907 GHz - 21,759,904,749 instructions # 2.28 insn per cycle - 3.333718015 seconds time elapsed +TOTAL : 3.016798 sec + 10,040,228,896 cycles # 3.072 GHz + 22,037,859,926 instructions # 2.19 insn per cycle + 3.324922224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.890125e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.891036e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.891036e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.950722e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951656e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951656e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.687196 sec - 25,936,497,205 cycles # 2.985 GHz - 79,455,431,598 instructions # 3.06 insn per cycle - 8.691442955 seconds time elapsed +TOTAL : 8.416183 sec + 25,916,224,646 cycles # 3.078 GHz + 79,453,865,963 instructions # 3.07 insn per cycle + 8.420247127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4504) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.674580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678053e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678053e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.759672e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.763188e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.763188e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.473580 sec - 12,663,684,829 cycles # 2.829 GHz - 38,526,072,859 instructions # 3.04 insn per cycle - 4.477928329 seconds time elapsed +TOTAL : 4.371398 sec + 12,639,801,464 cycles # 2.889 GHz + 38,524,761,271 instructions # 3.05 insn per cycle + 4.375560053 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12928) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.447225e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.464376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.464376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.630529e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.648410e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.648410e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.950551 sec - 5,554,043,311 cycles # 2.842 GHz - 13,609,444,575 instructions # 2.45 insn per cycle - 1.954818500 seconds time elapsed +TOTAL : 1.909613 sec + 5,559,227,570 cycles # 2.906 GHz + 13,609,303,550 instructions # 2.45 insn per cycle + 1.913823155 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.528912e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.551046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.551046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.332740e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.353313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.353313e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730043 sec - 4,918,299,350 cycles # 2.837 GHz - 12,276,281,852 instructions # 2.50 insn per cycle - 1.734286887 seconds time elapsed +TOTAL : 1.766447 sec + 4,917,170,589 cycles # 2.778 GHz + 12,276,136,667 instructions # 2.50 insn per cycle + 1.770689432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.227160e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.239598e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.239598e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.605174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618655e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.278650 sec - 4,148,690,065 cycles # 1.818 GHz - 6,446,007,726 instructions # 1.55 insn per cycle - 2.282996103 seconds time elapsed +TOTAL : 2.166306 sec + 4,144,641,386 cycles # 1.911 GHz + 6,445,298,096 instructions # 1.56 insn per cycle + 2.170508580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1627) (512y: 191) (512z: 9356) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index f7c4424904..b73b517066 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:27:51 +DATE: 2023-11-09_17:49:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070515e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070905e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.071008e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.070656e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.071067e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.071174e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.420963 sec - 8,223,258,722 cycles # 3.000 GHz - 17,670,197,130 instructions # 2.15 insn per cycle - 2.797812392 seconds time elapsed +TOTAL : 2.421343 sec + 8,332,807,450 cycles # 3.040 GHz + 16,939,230,243 instructions # 2.03 insn per cycle + 2.799270804 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.267469e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.269461e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.269740e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.271200e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273122e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273304e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.983357 sec - 12,890,762,548 cycles # 2.986 GHz - 28,149,713,448 instructions # 2.18 insn per cycle - 4.374511500 seconds time elapsed +TOTAL : 3.985063 sec + 13,247,174,015 cycles # 3.069 GHz + 30,019,215,878 instructions # 2.27 insn per cycle + 4.374841890 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.327736e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.327962e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.327962e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228283e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.228511e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.228511e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.345395 sec - 18,808,426,055 cycles # 2.963 GHz - 53,915,859,593 instructions # 2.87 insn per cycle - 6.349306785 seconds time elapsed +TOTAL : 6.424546 sec + 18,798,364,918 cycles # 2.925 GHz + 53,916,162,526 instructions # 2.87 insn per cycle + 6.428517349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.631387e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.631477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.631477e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.657858e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.657947e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657947e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.247242 sec - 9,798,431,936 cycles # 3.015 GHz - 27,093,078,884 instructions # 2.77 insn per cycle - 3.251306892 seconds time elapsed +TOTAL : 3.191098 sec + 9,844,225,763 cycles # 3.082 GHz + 27,092,778,504 instructions # 2.75 insn per cycle + 3.195159677 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.527269e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527671e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527671e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.638511e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.638939e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638939e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.502062 sec - 4,254,510,227 cycles # 2.826 GHz - 9,561,365,042 instructions # 2.25 insn per cycle - 1.506086006 seconds time elapsed +TOTAL : 1.457101 sec + 4,229,207,978 cycles # 2.896 GHz + 9,561,222,824 instructions # 2.26 insn per cycle + 1.461220413 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.044745e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.045315e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.045315e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.119963e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.120507e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120507e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.310362 sec - 3,714,842,589 cycles # 2.828 GHz - 8,485,417,237 instructions # 2.28 insn per cycle - 1.314439582 seconds time elapsed +TOTAL : 1.286739 sec + 3,714,427,423 cycles # 2.879 GHz + 8,485,272,385 instructions # 2.28 insn per cycle + 1.290826596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.650927e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.651448e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.651448e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.600399e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600911e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600911e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.452786 sec - 2,695,403,304 cycles # 1.852 GHz - 4,273,125,151 instructions # 1.59 insn per cycle - 1.456779010 seconds time elapsed +TOTAL : 1.474924 sec + 2,695,875,361 cycles # 1.824 GHz + 4,273,169,567 instructions # 1.59 insn per cycle + 1.479057981 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index f73b319e4d..28081b2160 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:55:03 +DATE: 2023-11-09_18:13:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.071005e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.071005e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.064318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065254e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065254e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.374046 sec - 8,061,206,235 cycles # 2.993 GHz - 17,860,181,288 instructions # 2.22 insn per cycle - 2.750065172 seconds time elapsed +TOTAL : 2.361712 sec + 8,164,385,199 cycles # 3.041 GHz + 16,942,565,052 instructions # 2.08 insn per cycle + 2.743176660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.226901e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.259810e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.259810e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.190361e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223459e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.996223 sec - 12,903,615,719 cycles # 2.989 GHz - 27,064,646,353 instructions # 2.10 insn per cycle - 4.375939404 seconds time elapsed +TOTAL : 3.988233 sec + 13,123,079,634 cycles # 3.036 GHz + 28,841,455,416 instructions # 2.20 insn per cycle + 4.378479494 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.320809e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.321082e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.321082e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.307342e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.351015 sec - 18,895,432,596 cycles # 2.975 GHz - 53,920,363,469 instructions # 2.85 insn per cycle - 6.355030283 seconds time elapsed +TOTAL : 6.364568 sec + 18,927,213,544 cycles # 2.973 GHz + 53,918,164,087 instructions # 2.85 insn per cycle + 6.368577598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.632581e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632679e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.632679e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.666114e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.666114e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.239771 sec - 9,805,010,159 cycles # 3.023 GHz - 27,094,031,310 instructions # 2.76 insn per cycle - 3.243901475 seconds time elapsed +TOTAL : 3.173683 sec + 9,797,609,023 cycles # 3.084 GHz + 27,093,782,808 instructions # 2.77 insn per cycle + 3.177702749 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96441) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.542776e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543249e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543249e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.255906e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.256265e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.256265e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.496243 sec - 4,233,173,830 cycles # 2.823 GHz - 9,562,510,318 instructions # 2.26 insn per cycle - 1.500255263 seconds time elapsed +TOTAL : 1.626420 sec + 4,592,212,308 cycles # 2.818 GHz + 9,562,781,549 instructions # 2.08 insn per cycle + 1.630448393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.008828e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.009454e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.009454e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.133405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.134023e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134023e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.322886 sec - 3,744,251,192 cycles # 2.823 GHz - 8,486,441,130 instructions # 2.27 insn per cycle - 1.326937869 seconds time elapsed +TOTAL : 1.281959 sec + 3,704,600,058 cycles # 2.882 GHz + 8,486,385,133 instructions # 2.29 insn per cycle + 1.285885098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594601e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595186e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595186e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.663239e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.663889e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.663889e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.474018 sec - 2,696,155,761 cycles # 1.825 GHz - 4,274,155,931 instructions # 1.59 insn per cycle - 1.478064357 seconds time elapsed +TOTAL : 1.446962 sec + 2,696,700,654 cycles # 1.860 GHz + 4,274,559,971 instructions # 1.59 insn per cycle + 1.451147700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 7a2b2c0da9..4570a77a9f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:28:55 +DATE: 2023-11-09_17:50:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.069743e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070108e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070239e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.067332e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067722e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067853e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.423143 sec - 8,082,723,641 cycles # 2.933 GHz - 18,147,438,278 instructions # 2.25 insn per cycle - 2.812330272 seconds time elapsed +TOTAL : 2.421722 sec + 8,395,936,189 cycles # 3.053 GHz + 18,623,375,460 instructions # 2.22 insn per cycle + 2.807666336 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.271955e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.273887e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.274124e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.274592e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.276551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.276737e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.988843 sec - 13,001,327,656 cycles # 3.014 GHz - 27,551,753,777 instructions # 2.12 insn per cycle - 4.370037996 seconds time elapsed +TOTAL : 3.997048 sec + 13,290,560,155 cycles # 3.077 GHz + 29,230,575,077 instructions # 2.20 insn per cycle + 4.378333342 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.093188e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.093423e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.093423e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.641666e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641939e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641939e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.521355 sec - 18,798,207,330 cycles # 2.882 GHz - 53,926,908,182 instructions # 2.87 insn per cycle - 6.525452544 seconds time elapsed +TOTAL : 6.117499 sec + 18,785,945,280 cycles # 3.070 GHz + 53,927,524,861 instructions # 2.87 insn per cycle + 6.121375903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.629486e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.629575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.629575e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.649159e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.649256e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649256e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.245853 sec - 9,848,079,716 cycles # 3.031 GHz - 27,090,265,030 instructions # 2.75 insn per cycle - 3.250037477 seconds time elapsed +TOTAL : 3.206425 sec + 9,787,082,067 cycles # 3.050 GHz + 27,089,817,225 instructions # 2.77 insn per cycle + 3.210577008 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96284) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.490286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490752e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490752e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.558533e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558987e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558987e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.517008 sec - 4,257,648,545 cycles # 2.800 GHz - 9,561,344,255 instructions # 2.25 insn per cycle - 1.521285854 seconds time elapsed +TOTAL : 1.489258 sec + 4,261,284,391 cycles # 2.855 GHz + 9,561,306,757 instructions # 2.24 insn per cycle + 1.493274617 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.021344e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.021901e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.021901e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.116449e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116994e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.319133 sec - 3,701,318,743 cycles # 2.798 GHz - 8,485,189,781 instructions # 2.29 insn per cycle - 1.323286884 seconds time elapsed +TOTAL : 1.287600 sec + 3,697,517,464 cycles # 2.864 GHz + 8,485,532,294 instructions # 2.29 insn per cycle + 1.291548783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378941e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379448e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379448e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.666755e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.667279e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.667279e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.566088 sec - 2,698,066,709 cycles # 1.719 GHz - 4,276,879,461 instructions # 1.59 insn per cycle - 1.570153625 seconds time elapsed +TOTAL : 1.444368 sec + 2,694,896,725 cycles # 1.862 GHz + 4,276,159,790 instructions # 1.59 insn per cycle + 1.448419547 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index f4e838f103..4a0d02936a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:29:59 +DATE: 2023-11-09_17:51:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756376e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756775e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.745896e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.746749e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.746990e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659165 sec - 5,717,115,891 cycles # 2.955 GHz - 12,190,075,892 instructions # 2.13 insn per cycle - 1.991284959 seconds time elapsed +TOTAL : 1.657612 sec + 5,852,337,885 cycles # 3.029 GHz + 12,128,434,322 instructions # 2.07 insn per cycle + 1.989363075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.328819e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.329492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329584e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335676e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335767e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.928671 sec - 6,641,955,934 cycles # 3.003 GHz - 14,330,947,638 instructions # 2.16 insn per cycle - 2.270510678 seconds time elapsed +TOTAL : 1.921239 sec + 6,689,269,410 cycles # 3.045 GHz + 13,766,829,986 instructions # 2.06 insn per cycle + 2.253627777 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.903818e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.904090e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.904090e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.077848e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.078128e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.078128e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.935151 sec - 17,988,960,616 cycles # 3.029 GHz - 53,590,161,611 instructions # 2.98 insn per cycle - 5.939109392 seconds time elapsed +TOTAL : 5.821316 sec + 17,888,760,787 cycles # 3.072 GHz + 53,591,267,283 instructions # 3.00 insn per cycle + 5.825272234 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.520103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520628e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520628e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.576360e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576807e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576807e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.505890 sec - 4,563,568,647 cycles # 3.024 GHz - 13,762,453,321 instructions # 3.02 insn per cycle - 1.509910484 seconds time elapsed +TOTAL : 1.480982 sec + 4,560,162,627 cycles # 3.072 GHz + 13,762,313,674 instructions # 3.02 insn per cycle + 1.485020552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.038019e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.039763e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.039763e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.154943e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156669e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156669e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.756034 sec - 2,141,156,270 cycles # 2.820 GHz - 4,816,859,984 instructions # 2.25 insn per cycle - 0.760083736 seconds time elapsed +TOTAL : 0.743454 sec + 2,138,545,582 cycles # 2.865 GHz + 4,816,682,793 instructions # 2.25 insn per cycle + 0.747370846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.079503e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.081743e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.081743e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228374e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.230533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.230533e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658883 sec - 1,871,387,054 cycles # 2.825 GHz - 4,273,792,692 instructions # 2.28 insn per cycle - 0.663026186 seconds time elapsed +TOTAL : 0.646748 sec + 1,869,005,080 cycles # 2.875 GHz + 4,273,904,960 instructions # 2.29 insn per cycle + 0.650625419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.037980e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.040224e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.040224e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.373581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376135e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376135e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.756823 sec - 1,355,166,582 cycles # 1.782 GHz - 2,158,764,056 instructions # 1.59 insn per cycle - 0.760952708 seconds time elapsed +TOTAL : 0.721971 sec + 1,354,973,724 cycles # 1.868 GHz + 2,158,504,507 instructions # 1.59 insn per cycle + 0.726042839 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2878) (512y: 49) (512z:79298) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 6fa929f5b1..b3edd3819c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:56:06 +DATE: 2023-11-09_18:14:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.804869e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806749e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806749e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.797007e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.798750e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.798750e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.602285 sec - 5,612,741,884 cycles # 2.994 GHz - 11,823,721,041 instructions # 2.11 insn per cycle - 1.932057655 seconds time elapsed +TOTAL : 1.595659 sec + 5,717,240,119 cycles # 3.061 GHz + 12,288,497,969 instructions # 2.15 insn per cycle + 1.924944467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.321250e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.334433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.334433e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.290056e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.302765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302765e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.875073 sec - 6,423,111,015 cycles # 2.987 GHz - 14,218,262,182 instructions # 2.21 insn per cycle - 2.206850504 seconds time elapsed +TOTAL : 1.886551 sec + 6,639,132,324 cycles # 3.056 GHz + 14,322,788,387 instructions # 2.16 insn per cycle + 2.229781396 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.905231e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.905509e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.905509e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.171261e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.171565e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.171565e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.938687 sec - 17,836,764,476 cycles # 3.002 GHz - 53,590,153,759 instructions # 3.00 insn per cycle - 5.942639449 seconds time elapsed +TOTAL : 5.764943 sec + 17,824,241,728 cycles # 3.090 GHz + 53,589,840,001 instructions # 3.01 insn per cycle + 5.768783827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.489420e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489830e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.489830e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577193e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.577612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577612e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.517580 sec - 4,611,683,817 cycles # 3.032 GHz - 13,763,345,896 instructions # 2.98 insn per cycle - 1.521625428 seconds time elapsed +TOTAL : 1.481390 sec + 4,567,533,848 cycles # 3.077 GHz + 13,763,213,169 instructions # 3.01 insn per cycle + 1.485335177 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.247085e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248950e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248950e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.234763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236470e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236470e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.733666 sec - 2,134,815,435 cycles # 2.897 GHz - 4,817,815,542 instructions # 2.26 insn per cycle - 0.737580401 seconds time elapsed +TOTAL : 0.735214 sec + 2,134,795,694 cycles # 2.891 GHz + 4,817,744,368 instructions # 2.26 insn per cycle + 0.739133829 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.255023e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.257521e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.257521e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.254949e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257396e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257396e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.644323 sec - 1,868,915,722 cycles # 2.886 GHz - 4,274,871,857 instructions # 2.29 insn per cycle - 0.648325497 seconds time elapsed +TOTAL : 0.644560 sec + 1,871,614,525 cycles # 2.889 GHz + 4,274,807,727 instructions # 2.28 insn per cycle + 0.648424122 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.514603e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.516833e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.516833e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.456942e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.459224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.459224e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.708018 sec - 1,353,648,095 cycles # 1.903 GHz - 2,159,618,866 instructions # 1.60 insn per cycle - 0.711901071 seconds time elapsed +TOTAL : 0.714093 sec + 1,353,332,363 cycles # 1.886 GHz + 2,159,539,680 instructions # 1.60 insn per cycle + 0.718064585 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2878) (512y: 49) (512z:79298) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 2b69abf3e0..0346c64d8e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:30:46 +DATE: 2023-11-09_17:51:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.751553e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.752429e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.752778e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.750539e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.751383e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.751707e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.662264 sec - 5,791,514,994 cycles # 2.989 GHz - 11,290,505,064 instructions # 1.95 insn per cycle - 1.994487544 seconds time elapsed +TOTAL : 1.659496 sec + 5,776,495,417 cycles # 2.991 GHz + 11,901,437,818 instructions # 2.06 insn per cycle + 2.001980183 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.318654e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.319320e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319463e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353072e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353865e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.936834 sec - 6,513,518,428 cycles # 2.942 GHz - 13,310,876,477 instructions # 2.04 insn per cycle - 2.270995377 seconds time elapsed +TOTAL : 1.912117 sec + 6,490,117,914 cycles # 2.968 GHz + 14,058,143,997 instructions # 2.17 insn per cycle + 2.245070466 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.877357e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.877629e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.877629e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.137878e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.138152e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.138152e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.953085 sec - 17,926,444,710 cycles # 3.010 GHz - 53,580,674,845 instructions # 2.99 insn per cycle - 5.957045253 seconds time elapsed +TOTAL : 5.784705 sec + 17,870,079,189 cycles # 3.088 GHz + 53,579,576,519 instructions # 3.00 insn per cycle + 5.788683996 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.538806e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.539230e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.539230e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.609484e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.497134 sec - 4,549,359,025 cycles # 3.032 GHz - 13,755,898,061 instructions # 3.02 insn per cycle - 1.501295301 seconds time elapsed +TOTAL : 1.467915 sec + 4,547,996,475 cycles # 3.091 GHz + 13,755,684,665 instructions # 3.02 insn per cycle + 1.471804589 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.000854e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.002553e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.002553e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.135956e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.137601e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.137601e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.759453 sec - 2,151,217,111 cycles # 2.820 GHz - 4,818,966,673 instructions # 2.24 insn per cycle - 0.763529614 seconds time elapsed +TOTAL : 0.744886 sec + 2,148,725,562 cycles # 2.872 GHz + 4,818,942,438 instructions # 2.24 insn per cycle + 0.748866334 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.076028e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.078137e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.078137e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.165432e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.167702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.167702e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658855 sec - 1,875,464,841 cycles # 2.832 GHz - 4,275,819,002 instructions # 2.28 insn per cycle - 0.662852680 seconds time elapsed +TOTAL : 0.651347 sec + 1,877,062,772 cycles # 2.867 GHz + 4,276,072,949 instructions # 2.28 insn per cycle + 0.655395180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.283691e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.286276e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.286276e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.338677e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.341123e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.341123e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.730286 sec - 1,357,956,935 cycles # 1.851 GHz - 2,164,994,730 instructions # 1.59 insn per cycle - 0.734341079 seconds time elapsed +TOTAL : 0.724588 sec + 1,360,263,123 cycles # 1.868 GHz + 2,164,996,305 instructions # 1.59 insn per cycle + 0.728742359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3475) (512y: 34) (512z:79492) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c2c8a96928..8c7934b526 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:31:33 +DATE: 2023-11-09_17:52:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.686778e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.687273e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.687409e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.693982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694475e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.694605e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171824 sec - 7,456,169,166 cycles # 2.995 GHz - 14,898,137,129 instructions # 2.00 insn per cycle - 2.549362993 seconds time elapsed +TOTAL : 2.169924 sec + 7,570,631,130 cycles # 3.042 GHz + 15,729,510,401 instructions # 2.08 insn per cycle + 2.547214982 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.112892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113171e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113203e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111941e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111967e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.401203 sec - 11,249,483,891 cycles # 3.009 GHz - 24,262,391,957 instructions # 2.16 insn per cycle - 3.794357278 seconds time elapsed +TOTAL : 3.399776 sec + 11,464,618,476 cycles # 3.079 GHz + 23,776,601,911 instructions # 2.07 insn per cycle + 3.779394913 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.772311e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.772526e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.772526e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.884667e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.884874e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884874e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.810863 sec - 19,135,542,784 cycles # 2.808 GHz - 54,153,577,866 instructions # 2.83 insn per cycle - 6.814854998 seconds time elapsed +TOTAL : 6.698742 sec + 19,113,024,695 cycles # 2.852 GHz + 54,153,033,540 instructions # 2.83 insn per cycle + 6.702658032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589475e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.589562e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.589562e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.621402e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621488e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621488e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327738 sec - 9,417,973,850 cycles # 2.827 GHz - 26,159,432,180 instructions # 2.78 insn per cycle - 3.331899471 seconds time elapsed +TOTAL : 3.261482 sec + 9,398,350,643 cycles # 2.879 GHz + 26,158,977,284 instructions # 2.78 insn per cycle + 3.265504352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96005) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.728829e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729288e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729288e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.791341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.420979 sec - 4,041,656,459 cycles # 2.838 GHz - 9,227,906,681 instructions # 2.28 insn per cycle - 1.425059392 seconds time elapsed +TOTAL : 1.398109 sec + 4,039,627,179 cycles # 2.883 GHz + 9,228,162,054 instructions # 2.28 insn per cycle + 1.402192827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.219686e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.220314e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.220314e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.351031e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.351641e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.351641e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.256653 sec - 3,545,597,499 cycles # 2.814 GHz - 8,175,250,543 instructions # 2.31 insn per cycle - 1.260805357 seconds time elapsed +TOTAL : 1.218443 sec + 3,518,124,342 cycles # 2.879 GHz + 8,175,077,517 instructions # 2.32 insn per cycle + 1.222409560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.660023e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.660558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.660558e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.765628e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766216e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.447622 sec - 2,657,673,224 cycles # 1.832 GHz - 4,154,915,823 instructions # 1.56 insn per cycle - 1.451764331 seconds time elapsed +TOTAL : 1.407689 sec + 2,655,252,329 cycles # 1.882 GHz + 4,154,811,941 instructions # 1.56 insn per cycle + 1.411617738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2045) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 485a0059f2..b26dd71707 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-08_21:32:35 +DATE: 2023-11-09_17:53:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.688491e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689012e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689176e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.674330e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674969e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.168620 sec - 7,451,542,055 cycles # 2.994 GHz - 15,551,253,703 instructions # 2.09 insn per cycle - 2.545633518 seconds time elapsed +TOTAL : 2.174009 sec + 7,611,935,314 cycles # 3.054 GHz + 16,836,441,609 instructions # 2.21 insn per cycle + 2.551658489 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107863e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108171e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107370e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107663e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.405108 sec - 11,192,783,578 cycles # 3.001 GHz - 25,734,796,379 instructions # 2.30 insn per cycle - 3.786804275 seconds time elapsed +TOTAL : 3.413929 sec + 11,386,114,072 cycles # 3.048 GHz + 23,902,448,329 instructions # 2.10 insn per cycle + 3.794282526 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.066104e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.066369e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.066369e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.931164e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.931386e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.931386e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.548059 sec - 19,079,779,477 cycles # 2.913 GHz - 54,153,651,610 instructions # 2.84 insn per cycle - 6.552064899 seconds time elapsed +TOTAL : 6.662052 sec + 19,079,234,145 cycles # 2.863 GHz + 54,153,851,240 instructions # 2.84 insn per cycle + 6.666006074 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589149e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.589238e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.589238e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.620269e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.620358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620358e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.327579 sec - 9,382,040,636 cycles # 2.817 GHz - 26,078,619,591 instructions # 2.78 insn per cycle - 3.331633706 seconds time elapsed +TOTAL : 3.263602 sec + 9,383,434,712 cycles # 2.872 GHz + 26,078,178,648 instructions # 2.78 insn per cycle + 3.267785109 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95899) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662193e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.662639e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.662639e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.732412e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.732940e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732940e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.447113 sec - 4,073,138,574 cycles # 2.808 GHz - 9,213,586,675 instructions # 2.26 insn per cycle - 1.451209760 seconds time elapsed +TOTAL : 1.420295 sec + 4,071,120,210 cycles # 2.859 GHz + 9,213,520,884 instructions # 2.26 insn per cycle + 1.424453149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.194379e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.195039e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.195039e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.308670e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.309271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309271e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.264023 sec - 3,548,672,085 cycles # 2.800 GHz - 8,168,128,611 instructions # 2.30 insn per cycle - 1.268138683 seconds time elapsed +TOTAL : 1.231097 sec + 3,538,361,762 cycles # 2.867 GHz + 8,168,060,632 instructions # 2.31 insn per cycle + 1.234995598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.707082e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.707666e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.707666e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.830037e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.830636e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.830636e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.430570 sec - 2,620,935,291 cycles # 1.830 GHz - 4,154,056,327 instructions # 1.58 insn per cycle - 1.434770233 seconds time elapsed +TOTAL : 1.385915 sec + 2,618,303,188 cycles # 1.885 GHz + 4,153,502,106 instructions # 1.59 insn per cycle + 1.389952232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1492) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 45ec48d9b4..6d792821e6 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:06 +DATE: 2023-11-09_17:47:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.850720e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.319691e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.646421e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.838115e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336717e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.669956e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445414 sec - 1,963,940,666 cycles # 2.941 GHz - 2,761,951,187 instructions # 1.41 insn per cycle - 0.725454441 seconds time elapsed +TOTAL : 0.441585 sec + 1,966,591,447 cycles # 2.991 GHz + 2,767,621,879 instructions # 1.41 insn per cycle + 0.715631755 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.571453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.132541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.489040e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.614381e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.150528e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.499874e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.525989 sec - 2,266,225,819 cycles # 2.950 GHz - 3,255,459,976 instructions # 1.44 insn per cycle - 0.825689166 seconds time elapsed +TOTAL : 0.519980 sec + 2,272,719,542 cycles # 3.015 GHz + 3,282,015,462 instructions # 1.44 insn per cycle + 0.810626224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074272e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096702e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.097912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.120487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.547835 sec - 4,705,088,880 cycles # 3.034 GHz - 13,467,070,551 instructions # 2.86 insn per cycle - 1.551905661 seconds time elapsed +TOTAL : 1.514129 sec + 4,699,091,915 cycles # 3.096 GHz + 13,466,947,436 instructions # 2.87 insn per cycle + 1.518294228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.836387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.906822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.906822e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.983607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.058142e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.058142e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.914850 sec - 2,629,820,703 cycles # 2.863 GHz - 7,555,643,977 instructions # 2.87 insn per cycle - 0.919312372 seconds time elapsed +TOTAL : 0.847498 sec + 2,625,908,011 cycles # 3.086 GHz + 7,555,492,469 instructions # 2.88 insn per cycle + 0.851823974 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.179916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.388522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.388522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.619511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619511e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538121 sec - 1,483,909,982 cycles # 2.739 GHz - 3,122,112,991 instructions # 2.10 insn per cycle - 0.542506000 seconds time elapsed +TOTAL : 0.504120 sec + 1,476,957,330 cycles # 2.909 GHz + 3,122,047,526 instructions # 2.11 insn per cycle + 0.508259108 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.492769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.748148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.748148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.754841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.026481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.026481e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.492302 sec - 1,352,205,323 cycles # 2.727 GHz - 2,983,986,621 instructions # 2.21 insn per cycle - 0.496759795 seconds time elapsed +TOTAL : 0.457617 sec + 1,342,416,487 cycles # 2.911 GHz + 2,984,161,058 instructions # 2.22 insn per cycle + 0.461673437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.316160e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.426685e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.426685e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.547509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672958e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.732612 sec - 1,330,714,647 cycles # 1.807 GHz - 1,956,053,126 instructions # 1.47 insn per cycle - 0.737097876 seconds time elapsed +TOTAL : 0.666989 sec + 1,325,861,856 cycles # 1.977 GHz + 1,955,811,920 instructions # 1.48 insn per cycle + 0.671229633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 9573fdc8ac..8337df6649 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:53:21 +DATE: 2023-11-09_18:11:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.674751e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.241786e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.241786e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580013e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253753e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253753e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.472075 sec - 2,011,630,446 cycles # 2.946 GHz - 2,977,593,506 instructions # 1.48 insn per cycle - 0.740354864 seconds time elapsed +TOTAL : 0.470514 sec + 2,029,905,705 cycles # 2.983 GHz + 3,022,396,069 instructions # 1.49 insn per cycle + 0.739050820 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.306214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.374405e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.374405e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.291351e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372563e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372563e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.746007 sec - 2,930,819,015 cycles # 2.951 GHz - 4,513,689,699 instructions # 1.54 insn per cycle - 1.051041659 seconds time elapsed +TOTAL : 0.742777 sec + 2,970,951,255 cycles # 2.999 GHz + 4,514,637,368 instructions # 1.52 insn per cycle + 1.047584901 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.067462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.089978e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.563342 sec - 4,743,647,659 cycles # 3.027 GHz - 13,474,115,002 instructions # 2.84 insn per cycle - 1.567732700 seconds time elapsed +TOTAL : 1.538614 sec + 4,724,111,132 cycles # 3.063 GHz + 13,474,132,709 instructions # 2.85 insn per cycle + 1.542829058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.931899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004806e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004806e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.968452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.876421 sec - 2,657,928,129 cycles # 3.020 GHz - 7,605,320,089 instructions # 2.86 insn per cycle - 0.880831982 seconds time elapsed +TOTAL : 0.860148 sec + 2,657,657,312 cycles # 3.076 GHz + 7,605,024,054 instructions # 2.86 insn per cycle + 0.864557816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3095) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.284426e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.500691e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.339093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.562110e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562110e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.528369 sec - 1,515,520,073 cycles # 2.846 GHz - 3,173,010,189 instructions # 2.09 insn per cycle - 0.533003329 seconds time elapsed +TOTAL : 0.520524 sec + 1,514,451,185 cycles # 2.892 GHz + 3,172,765,595 instructions # 2.09 insn per cycle + 0.524939185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.626971e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.708754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.978270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.480865 sec - 1,378,241,594 cycles # 2.844 GHz - 3,034,725,088 instructions # 2.20 insn per cycle - 0.485339539 seconds time elapsed +TOTAL : 0.469539 sec + 1,371,933,121 cycles # 2.899 GHz + 3,033,200,949 instructions # 2.21 insn per cycle + 0.473789571 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.445545e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.566504e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.566504e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.533145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.701984 sec - 1,365,857,372 cycles # 1.935 GHz - 1,995,672,274 instructions # 1.46 insn per cycle - 0.706431315 seconds time elapsed +TOTAL : 0.676505 sec + 1,357,238,089 cycles # 1.995 GHz + 1,995,412,477 instructions # 1.47 insn per cycle + 0.680880338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index a982c1092c..2ec6b9dc47 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:24 +DATE: 2023-11-09_17:47:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.808432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.231946e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.554075e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.819082e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.206686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.526015e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446409 sec - 1,914,723,819 cycles # 2.858 GHz - 2,720,530,830 instructions # 1.42 insn per cycle - 0.726781250 seconds time elapsed +TOTAL : 0.443165 sec + 1,961,379,003 cycles # 2.989 GHz + 2,781,357,072 instructions # 1.42 insn per cycle + 0.713330689 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.542836e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030986e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.390683e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580414e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.034117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.374431e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.529359 sec - 2,191,645,691 cycles # 2.864 GHz - 3,157,433,372 instructions # 1.44 insn per cycle - 0.823018193 seconds time elapsed +TOTAL : 0.527713 sec + 2,200,622,669 cycles # 2.860 GHz + 3,134,287,672 instructions # 1.42 insn per cycle + 0.826392715 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036305e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058434e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.055304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.603791 sec - 4,708,850,584 cycles # 2.929 GHz - 13,461,227,684 instructions # 2.86 insn per cycle - 1.607981971 seconds time elapsed +TOTAL : 1.607267 sec + 4,703,491,098 cycles # 2.920 GHz + 13,461,246,606 instructions # 2.86 insn per cycle + 1.611368977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.854678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.906299 sec - 2,638,123,420 cycles # 2.899 GHz - 7,554,662,347 instructions # 2.86 insn per cycle - 0.910729092 seconds time elapsed +TOTAL : 0.845910 sec + 2,624,687,455 cycles # 3.090 GHz + 7,554,687,341 instructions # 2.88 insn per cycle + 0.850163593 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.120658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.331862e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.331862e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.383208e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.548282 sec - 1,490,121,110 cycles # 2.699 GHz - 3,120,571,278 instructions # 2.09 insn per cycle - 0.552853693 seconds time elapsed +TOTAL : 0.505300 sec + 1,477,429,478 cycles # 2.904 GHz + 3,120,730,266 instructions # 2.11 insn per cycle + 0.509369657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.460892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.716719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.716719e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.736623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.003084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003084e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496477 sec - 1,349,987,385 cycles # 2.699 GHz - 2,981,775,320 instructions # 2.21 insn per cycle - 0.500801099 seconds time elapsed +TOTAL : 0.460033 sec + 1,340,907,328 cycles # 2.892 GHz + 2,981,159,149 instructions # 2.22 insn per cycle + 0.464174349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.283025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.395178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.395178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537070e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.658764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.658764e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.742923 sec - 1,336,539,142 cycles # 1.791 GHz - 1,954,402,399 instructions # 1.46 insn per cycle - 0.747445158 seconds time elapsed +TOTAL : 0.669277 sec + 1,326,031,179 cycles # 1.971 GHz + 1,954,098,862 instructions # 1.47 insn per cycle + 0.673467594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 0870ac1612..25d66c7041 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:42 +DATE: 2023-11-09_17:47:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.731772e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218499e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.346344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.746320e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236957e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.360917e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.443647 sec - 1,860,995,101 cycles # 2.829 GHz - 2,577,640,181 instructions # 1.39 insn per cycle - 0.715495532 seconds time elapsed +TOTAL : 0.440483 sec + 1,942,018,247 cycles # 2.976 GHz + 2,734,614,888 instructions # 1.41 insn per cycle + 0.710582968 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.975472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.830515e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.954464e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.010716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.836484e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960610e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.479313 sec - 1,996,286,635 cycles # 2.831 GHz - 2,879,932,210 instructions # 1.44 insn per cycle - 0.762534142 seconds time elapsed +TOTAL : 0.476376 sec + 2,093,828,578 cycles # 2.973 GHz + 2,983,215,115 instructions # 1.42 insn per cycle + 0.763925577 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.068560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092999e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092999e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.150183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175828e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175828e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.555116 sec - 4,461,765,661 cycles # 2.863 GHz - 13,052,553,175 instructions # 2.93 insn per cycle - 1.559192669 seconds time elapsed +TOTAL : 1.444504 sec + 4,454,034,181 cycles # 3.077 GHz + 13,052,158,813 instructions # 2.93 insn per cycle + 1.448436066 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.882925e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.070631e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.075306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.270472e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.270472e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.589518 sec - 1,706,750,598 cycles # 2.878 GHz - 4,515,023,670 instructions # 2.65 insn per cycle - 0.593859816 seconds time elapsed +TOTAL : 0.552218 sec + 1,700,873,014 cycles # 3.061 GHz + 4,515,081,496 instructions # 2.65 insn per cycle + 0.556201186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.765834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.493743e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.493743e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.031649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.790374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.790374e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305319 sec - 853,645,854 cycles # 2.763 GHz - 1,898,477,314 instructions # 2.22 insn per cycle - 0.309705869 seconds time elapsed +TOTAL : 0.291602 sec + 850,563,357 cycles # 2.883 GHz + 1,898,510,633 instructions # 2.23 insn per cycle + 0.295657443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.141881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.979826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.979826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.014318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.832565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.832565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287752 sec - 800,772,449 cycles # 2.748 GHz - 1,821,769,219 instructions # 2.28 insn per cycle - 0.292040341 seconds time elapsed +TOTAL : 0.293482 sec + 802,625,962 cycles # 2.700 GHz + 1,821,591,063 instructions # 2.27 insn per cycle + 0.297764671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 29,120,008 cycles # 2.647 GHz - 41,681,258 instructions # 1.43 insn per cycle - 0.011379573 seconds time elapsed + 29,732,895 cycles # 2.697 GHz + 41,670,508 instructions # 1.40 insn per cycle + 0.011409242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 0597ee22a3..687daa906c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:53:38 +DATE: 2023-11-09_18:11:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.639706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.257120e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.257120e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.747186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.237510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237510e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.449974 sec - 1,947,595,961 cycles # 2.942 GHz - 2,880,549,080 instructions # 1.48 insn per cycle - 0.719459148 seconds time elapsed +TOTAL : 0.451024 sec + 1,971,298,564 cycles # 2.989 GHz + 2,921,186,990 instructions # 1.48 insn per cycle + 0.718171404 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.168463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.812098e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.812098e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.154719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.829239e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.829239e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.616345 sec - 2,486,978,198 cycles # 2.935 GHz - 3,790,315,811 instructions # 1.52 insn per cycle - 0.904030203 seconds time elapsed +TOTAL : 0.620701 sec + 2,514,914,307 cycles # 2.959 GHz + 3,812,117,615 instructions # 1.52 insn per cycle + 0.908673198 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.130080e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155980e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.130761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156131e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.473693 sec - 4,471,154,333 cycles # 3.027 GHz - 13,056,458,670 instructions # 2.92 insn per cycle - 1.477812555 seconds time elapsed +TOTAL : 1.472797 sec + 4,472,979,155 cycles # 3.030 GHz + 13,056,761,338 instructions # 2.92 insn per cycle + 1.477050712 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.025004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.219703e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.219703e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.077738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.566150 sec - 1,723,667,712 cycles # 3.025 GHz - 4,563,297,886 instructions # 2.65 insn per cycle - 0.570436411 seconds time elapsed +TOTAL : 0.555619 sec + 1,722,866,665 cycles # 3.081 GHz + 4,563,322,469 instructions # 2.65 insn per cycle + 0.559797755 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.858362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587732e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587732e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.956375e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689121e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.303895 sec - 871,800,602 cycles # 2.835 GHz - 1,935,423,519 instructions # 2.22 insn per cycle - 0.308064640 seconds time elapsed +TOTAL : 0.298686 sec + 869,037,023 cycles # 2.875 GHz + 1,935,544,426 instructions # 2.23 insn per cycle + 0.302811266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.340209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.201036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.201036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.465666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.344453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.344453e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282456 sec - 818,779,897 cycles # 2.862 GHz - 1,858,681,592 instructions # 2.27 insn per cycle - 0.286757422 seconds time elapsed +TOTAL : 0.276910 sec + 817,448,595 cycles # 2.915 GHz + 1,858,610,780 instructions # 2.27 insn per cycle + 0.280974833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -211,9 +211,9 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) - 37,403,629 cycles # 2.691 GHz - 50,469,890 instructions # 1.35 insn per cycle - 0.014372629 seconds time elapsed + 37,531,426 cycles # 2.805 GHz + 50,366,354 instructions # 1.34 insn per cycle + 0.013813903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1969) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 1f88f16cf0..8bc404b84b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:26:59 +DATE: 2023-11-09_17:48:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.710525e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199979e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.326321e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.693711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339602e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.440656 sec - 1,914,804,492 cycles # 2.925 GHz - 2,653,138,253 instructions # 1.39 insn per cycle - 0.711749358 seconds time elapsed +TOTAL : 0.438778 sec + 1,941,964,603 cycles # 2.979 GHz + 2,729,283,404 instructions # 1.41 insn per cycle + 0.709465120 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.891594e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784645e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799531e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917090e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476610 sec - 2,083,856,457 cycles # 2.940 GHz - 2,965,032,628 instructions # 1.42 insn per cycle - 0.765879589 seconds time elapsed +TOTAL : 0.470244 sec + 2,084,172,928 cycles # 3.008 GHz + 2,971,888,877 instructions # 1.43 insn per cycle + 0.750810002 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129719e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.156981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183336e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.470593 sec - 4,452,128,732 cycles # 3.020 GHz - 13,033,118,765 instructions # 2.93 insn per cycle - 1.474660881 seconds time elapsed +TOTAL : 1.435690 sec + 4,451,626,158 cycles # 3.094 GHz + 13,032,987,489 instructions # 2.93 insn per cycle + 1.439578191 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.040157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.234537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.234537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.129722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328624e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.558718 sec - 1,691,566,910 cycles # 3.008 GHz - 4,511,110,866 instructions # 2.67 insn per cycle - 0.562886591 seconds time elapsed +TOTAL : 0.542754 sec + 1,689,058,698 cycles # 3.092 GHz + 4,510,968,389 instructions # 2.67 insn per cycle + 0.546880720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.942184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.690459e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690459e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.059640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.837369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.837369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.296099 sec - 853,486,904 cycles # 2.847 GHz - 1,895,390,282 instructions # 2.22 insn per cycle - 0.300311325 seconds time elapsed +TOTAL : 0.290425 sec + 852,449,044 cycles # 2.901 GHz + 1,895,470,717 instructions # 2.22 insn per cycle + 0.294595816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.374489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.242458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.242458e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.503379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.376998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.376998e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.277008 sec - 800,885,707 cycles # 2.855 GHz - 1,817,516,411 instructions # 2.27 insn per cycle - 0.281135474 seconds time elapsed +TOTAL : 0.271227 sec + 799,263,402 cycles # 2.909 GHz + 1,817,410,136 instructions # 2.27 insn per cycle + 0.275264605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -194,9 +194,9 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions - 28,754,068 cycles # 2.640 GHz - 40,955,371 instructions # 1.42 insn per cycle - 0.011419598 seconds time elapsed + 28,811,890 cycles # 2.702 GHz + 40,903,960 instructions # 1.42 insn per cycle + 0.011044926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1932) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d5ef07e007..eab7ec279c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:27:16 +DATE: 2023-11-09_17:48:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.821562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300473e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.628825e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.897435e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.394394e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.726655e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.447954 sec - 1,932,564,435 cycles # 2.921 GHz - 2,743,560,511 instructions # 1.42 insn per cycle - 0.719731147 seconds time elapsed +TOTAL : 0.442647 sec + 2,004,900,013 cycles # 3.007 GHz + 2,826,895,466 instructions # 1.41 insn per cycle + 0.724412660 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.575286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.143575e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.499311e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.620708e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161875e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511766e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523239 sec - 2,243,203,062 cycles # 2.949 GHz - 3,244,551,518 instructions # 1.45 insn per cycle - 0.818196957 seconds time elapsed +TOTAL : 0.520790 sec + 2,257,400,704 cycles # 2.997 GHz + 3,259,917,218 instructions # 1.44 insn per cycle + 0.810908697 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091100e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088540e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110933e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.554535 sec - 4,725,018,841 cycles # 3.035 GHz - 13,469,753,614 instructions # 2.85 insn per cycle - 1.558693291 seconds time elapsed +TOTAL : 1.526966 sec + 4,723,154,452 cycles # 3.087 GHz + 13,469,602,667 instructions # 2.85 insn per cycle + 1.531097432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.970313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.046371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.046371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.063440e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.063440e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.853333 sec - 2,596,868,107 cycles # 3.030 GHz - 7,388,624,187 instructions # 2.85 insn per cycle - 0.857591565 seconds time elapsed +TOTAL : 0.845345 sec + 2,599,329,855 cycles # 3.062 GHz + 7,388,612,618 instructions # 2.84 insn per cycle + 0.849529924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.332912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554037e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554037e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.404332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.629825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.629825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.513899 sec - 1,466,763,063 cycles # 2.835 GHz - 3,057,876,447 instructions # 2.08 insn per cycle - 0.518107133 seconds time elapsed +TOTAL : 0.502979 sec + 1,466,711,057 cycles # 2.896 GHz + 3,057,623,965 instructions # 2.08 insn per cycle + 0.507143043 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.777029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085245e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455720 sec - 1,306,910,741 cycles # 2.845 GHz - 2,932,818,419 instructions # 2.24 insn per cycle - 0.460076062 seconds time elapsed +TOTAL : 0.452713 sec + 1,309,685,857 cycles # 2.871 GHz + 2,932,566,248 instructions # 2.24 insn per cycle + 0.456835979 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.391166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.500870e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.500870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.510097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510097e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.709219 sec - 1,365,455,058 cycles # 1.916 GHz - 1,971,797,344 instructions # 1.44 insn per cycle - 0.713482957 seconds time elapsed +TOTAL : 0.707515 sec + 1,366,670,273 cycles # 1.922 GHz + 1,971,774,412 instructions # 1.44 insn per cycle + 0.711692701 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 6e69f82aee..804124a528 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-08_21:27:34 +DATE: 2023-11-09_17:48:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.812345e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.208019e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.520614e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.811798e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.176696e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.495530e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446229 sec - 1,955,610,259 cycles # 2.936 GHz - 2,744,647,203 instructions # 1.40 insn per cycle - 0.725146174 seconds time elapsed +TOTAL : 0.443833 sec + 2,007,951,396 cycles # 2.999 GHz + 2,822,905,809 instructions # 1.41 insn per cycle + 0.728453943 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.529337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.985996e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.326246e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.587196e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.041060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377539e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.522218 sec - 2,238,618,942 cycles # 2.943 GHz - 3,202,939,408 instructions # 1.43 insn per cycle - 0.817604471 seconds time elapsed +TOTAL : 0.523091 sec + 2,298,379,472 cycles # 2.986 GHz + 3,299,691,245 instructions # 1.44 insn per cycle + 0.827230276 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.065182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087517e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087517e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.081127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.560291 sec - 4,729,799,308 cycles # 3.025 GHz - 13,455,876,389 instructions # 2.84 insn per cycle - 1.564515481 seconds time elapsed +TOTAL : 1.537190 sec + 4,726,723,623 cycles # 3.068 GHz + 13,455,766,194 instructions # 2.85 insn per cycle + 1.541247326 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.946971e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.020229e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.020229e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.984806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061569e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.863034 sec - 2,601,868,480 cycles # 3.003 GHz - 7,392,543,085 instructions # 2.84 insn per cycle - 0.867199240 seconds time elapsed +TOTAL : 0.846326 sec + 2,602,293,302 cycles # 3.065 GHz + 7,392,635,608 instructions # 2.84 insn per cycle + 0.850454133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.323539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.538773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599128e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599128e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.514660 sec - 1,469,850,553 cycles # 2.835 GHz - 3,058,079,146 instructions # 2.08 insn per cycle - 0.519050232 seconds time elapsed +TOTAL : 0.506085 sec + 1,466,467,612 cycles # 2.876 GHz + 3,058,106,145 instructions # 2.09 insn per cycle + 0.510457197 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.767525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.778195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.059768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.059768e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456721 sec - 1,309,025,943 cycles # 2.843 GHz - 2,933,534,120 instructions # 2.24 insn per cycle - 0.460967936 seconds time elapsed +TOTAL : 0.455384 sec + 1,311,774,111 cycles # 2.858 GHz + 2,933,399,487 instructions # 2.24 insn per cycle + 0.459674797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.405794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.516831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.516831e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497799e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.704869 sec - 1,364,487,579 cycles # 1.926 GHz - 1,971,713,310 instructions # 1.45 insn per cycle - 0.709028391 seconds time elapsed +TOTAL : 0.711136 sec + 1,370,131,308 cycles # 1.917 GHz + 1,971,581,787 instructions # 1.44 insn per cycle + 0.715633425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From b89bf4cd355304d673506ce11aff4dbc3c4e04c4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 06:38:26 +0100 Subject: [PATCH 14/14] [gpucpp] ** COMPLETE GPUCPP** rerun 18 tmad tests after the upgrade to 3.5.2, no change in functionality or performance STARTED AT Thu Nov 9 06:24:51 PM CET 2023 ENDED AT Thu Nov 9 10:43:11 PM CET 2023 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 138 ++++++++--------- .../log_eemumu_mad_f_inl0_hrd0.txt | 134 ++++++++--------- .../log_eemumu_mad_m_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggtt_mad_d_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggtt_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggtt_mad_m_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 134 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 140 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 132 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 138 ++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 134 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 136 ++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 138 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 136 ++++++++--------- 18 files changed, 
1222 insertions(+), 1222 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 383178f656..16028d3846 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:17 +DATE: 2023-11-09_18:26:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6257s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6178s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6383s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6302s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1766s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1680s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1797s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4280s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3388s + [COUNTERS] Fortran MEs ( 1 ) : 0.0892s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4131s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0701s for 90112 events => throughput is 1.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4144s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.3434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0710s for 90112 events => throughput is 1.27E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.227734e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246747e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254814e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1774s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3861s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 90112 events => throughput is 2.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3872s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0440s for 90112 events => throughput is 2.05E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002470e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008841e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.006601e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041604e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3431s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.620678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.648221e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.819190e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.737599e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3723s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3407s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0316s for 90112 events => throughput is 2.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3409s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 90112 events => throughput is 2.84E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.820321e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.822405e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.842053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.840653e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1819s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1785s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.34E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 90112 events => throughput is 2.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.075096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.213684e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.166357e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.288308e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5934s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5941s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5936s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.56E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7863s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7814s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7594s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.141020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122558e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.873271e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902108e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.990853e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.029032e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.361218e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.427964e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.939860e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.944408e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966232e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.975323e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.011562e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.124184e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099952e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 4b3b0b9b07..bed8731e5c 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:35 +DATE: 2023-11-09_18:26:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6295s + [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.03E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1778s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1697s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1700s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4139s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s - [COUNTERS] Fortran MEs ( 1 ) : 0.0849s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] Fortran MEs ( 1 ) : 0.0858s for 90112 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] PROGRAM TOTAL : 0.1876s [COUNTERS] Fortran Overhead ( 0 ) : 0.1813s 
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4142s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0687s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3439s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 90112 events => throughput is 1.30E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.261327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.290954e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.287607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269110e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1799s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.11E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3425s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 90112 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.211958e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.298087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331194e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3609s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 90112 events => throughput is 3.47E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.442542e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583243e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.634986e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664821e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1881s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.64E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 90112 events => throughput is 3.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4013s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 90112 events => throughput is 3.35E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.588607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708142e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.872180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716354e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1865s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.69E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3791s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0259s for 90112 events => throughput is 3.47E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372399e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.388042e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.586770e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799218e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.69E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5957s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5952s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7616s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.96E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.97E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.577355e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.613080e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.822297e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.898284e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.937359e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.543811e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.046785e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.026187e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.102347e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.468953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203659e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241582e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.365649e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812787e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.422918e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.411277e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 9a947a36a5..8b8c11aaf5 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,13 +17,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:08:51 +DATE: 2023-11-09_18:26:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6186s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6293s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6211s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1781s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4162s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] Fortran MEs ( 1 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4185s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3320s + [COUNTERS] Fortran MEs ( 1 ) : 0.0865s for 90112 events => throughput is 1.04E+06 
events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,8 +134,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1898s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1832s + [COUNTERS] PROGRAM TOTAL : 0.1883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1817s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4177s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3452s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0725s for 90112 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192297e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.208788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.206668e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1790s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1831s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.12E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3821s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3847s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3426s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.047978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.077610e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.116427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127798e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1841s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3733s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3413s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.642107e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567900e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.787956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786544e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.82E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,8 +395,8 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3708s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3395s + [COUNTERS] PROGRAM TOTAL : 0.3728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3415s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 90112 events => throughput is 2.88E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.821887e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787216e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.874115e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.802177e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 90112 events => throughput is 2.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3437s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0371s for 90112 events => throughput is 2.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.237740e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.306669e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366800e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302969e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5935s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5947s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.82E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7615s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.88E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.007927e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094813e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.918411e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912678e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.018629e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000800e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.348012e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.334730e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.994146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018486e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.917104e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914438e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.983673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.024074e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.123333e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129214e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 3e628018af..824a8e25d5 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:09:08 +DATE: 2023-11-09_18:26:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3517s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3111s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3140s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3086s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2681s - [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2683s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6533s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2091s - [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] 
PROGRAM TOTAL : 1.6956s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2429s + [COUNTERS] Fortran MEs ( 1 ) : 0.4528s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3456s + [COUNTERS] PROGRAM TOTAL : 0.3445s [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 8192 events => throughput is 2.16E+05 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 8192 events => throughput is 2.23E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6716s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2645s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4071s for 90112 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6787s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4128s for 90112 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.224417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.212367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211188e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3132s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2919s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4780s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2359s for 90112 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2354s for 90112 events => throughput is 3.83E+05 events/s *** (2-sse4) 
Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.777989e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.806213e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.740213e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.795645e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2955s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2981s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.23E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3915s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3832s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2385s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1448s for 90112 events => throughput is 6.22E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.030466e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.053490e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.192047e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.106690e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2946s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2826s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2943s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.97E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3659s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2362s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1297s for 90112 events => throughput is 6.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3653s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1287s for 90112 events => throughput is 7.00E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.841360e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.704382e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.816529e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.799597e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3142s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3082s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2885s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.15E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4624s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2517s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2106s for 90112 events => throughput is 4.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4291s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2333s for 90112 events => throughput is 3.86E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.955720e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.938387e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.094472e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.929754e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6940s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6935s
+ [COUNTERS] PROGRAM TOTAL : 0.6969s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6963s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.7032s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6968s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6570s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.103744e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.071187e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.691695e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.692368e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.183000e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.070229e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.074203e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.168601e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.195387e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.149757e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.150737e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.190999e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.203236e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017633e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.040065e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 0321a276a0..6ff403b879 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -4,9 +4,9 @@ CUDACPP_BUILDDIR='.'
 make USEBUILDDIR=1 AVX=none
-
-make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=sse4
+
 make USEBUILDDIR=1 AVX=512y
 make USEBUILDDIR=1 AVX=512z
@@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:09:34
+DATE: 2023-11-09_18:27:25
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 420 events (found 1577 events)
- [COUNTERS] PROGRAM TOTAL : 0.3489s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3494s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0401s for 8192 events => throughput is 2.04E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3073s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2670s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0403s for 8192 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3068s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6502s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2060s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4442s for 90112 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6536s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2070s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.4466s for 90112 events => throughput is 2.02E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3425s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3397s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3049s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0348s for 8192 events => throughput is 2.35E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6631s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2770s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3861s for 90112 events => throughput is 2.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6398s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2589s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3809s for 90112 events => throughput is 2.37E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.342613e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.342865e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319125e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.331036e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3002s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2858s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.66E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2992s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2845s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.59E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3973s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2359s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1614s for 90112 events => throughput is 5.58E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4772s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3090s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1682s for 90112 events => throughput is 5.36E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.270911e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.225442e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.359921e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.299428e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2834s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2758s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3093s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3001s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0092s for 8192 events => throughput is 8.88E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3197s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 90112 events => throughput is 1.07E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3166s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2317s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0848s for 90112 events => throughput is 1.06E+06 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.026437e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.025673e+06 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.028771e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.017812e+06 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,8 +362,8 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2894s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s
+ [COUNTERS] PROGRAM TOTAL : 0.2858s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2785s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3159s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2370s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 90112 events => throughput is 1.14E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3072s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2282s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0790s for 90112 events => throughput is 1.14E+06 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.095999e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.097760e+06 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.120004e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.119253e+06 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2909s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2907s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2807s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0099s for 8192 events => throughput is 8.25E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4173s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2979s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1195s for 90112 events => throughput is 7.54E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3509s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2397s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 90112 events => throughput is 8.10E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.668644e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.884299e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.548978e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.701504e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6943s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6937s
+ [COUNTERS] PROGRAM TOTAL : 0.6960s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6955s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6513s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6459s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.67E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6624s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6571s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.266713e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.111635e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.234896e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.880409e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.830084e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.143607e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.762403e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.762374e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.776301e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.140173e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.872477e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.866583e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.374142e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.685718e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.426544e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.400545e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 8bacc65fe8..9b02995ca5 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -1,8 +1,8 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 CUDACPP_BUILDDIR='.'
-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:09:59
+DATE: 2023-11-09_18:27:49
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 420 events (found 1577 events)
- [COUNTERS] PROGRAM TOTAL : 0.3627s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3194s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3509s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3105s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3074s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3067s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2662s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0405s for 8192 events => throughput is 2.02E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6907s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2365s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4542s for 90112 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6580s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2117s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.4462s for 90112 events => throughput is 2.02E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3443s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3074s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3460s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 8192 events => throughput is 2.16E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6798s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2683s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4115s for 90112 events => throughput is 2.19E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6700s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4119s for 90112 events => throughput is 2.19E+05 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.164831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.182152e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.183670e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.183502e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3143s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0208s for 8192 events => throughput is 3.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3147s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0207s for 8192 events => throughput is 3.96E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4761s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2466s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2295s for 90112 events => throughput is 3.93E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4759s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2478s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2281s for 90112 events => throughput is 3.95E+05 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.799865e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.820026e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.756525e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.775419e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2976s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2965s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2837s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4201s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2736s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1464s for 90112 events => throughput is 6.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3920s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2488s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1432s for 90112 events => throughput is 6.29E+05 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.181937e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.159361e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.243573e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.220899e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.2977s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2865s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3061s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2938s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.3670s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2408s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1262s for 90112 events => throughput is 7.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3693s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2423s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1270s for 90112 events => throughput is 7.10E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.933959e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.912537e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.064349e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.069074e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.3083s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 8192 events => throughput is 4.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3327s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3109s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.75E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.4519s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2484s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2036s for 90112 events => throughput is 4.43E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4629s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2551s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2078s for 90112 events => throughput is 4.34E+05 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.266660e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.077933e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.117226e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.997576e+05 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1
 [UNWEIGHT] Wrote 434 events (found 1125 events)
- [COUNTERS] PROGRAM TOTAL : 0.6949s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6943s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6985s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6979s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1
 [UNWEIGHT] Wrote 1727 events (found 1732 events)
- [COUNTERS] PROGRAM TOTAL : 1.6539s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6476s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.42E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6617s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6553s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.049281e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.060435e+07 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.529307e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.608769e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.148817e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.186491e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.053163e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.059369e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.170472e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.182441e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.130394e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.136921e+08 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.186789e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.174632e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.035076e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.949461e+07 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 09e16e6057..241597d591 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+
 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=
-DATE: 2023-11-08_22:10:24
+DATE: 2023-11-09_18:28:15
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5436s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2280s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3156s for 8192 events => throughput is 2.60E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5556s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5326s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2186s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3141s for 8192 events => throughput is 2.61E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5351s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3148s for 8192 events => throughput is 2.60E+04 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 4.9133s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4066s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5067s for 90112 events => throughput is 2.57E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.8579s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3886s
+ [COUNTERS] Fortran MEs ( 1 ) : 3.4692s for 90112 events => throughput is 2.60E+04 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8544s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5319s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3225s for 8192 events => throughput is 2.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8596s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3241s for 8192 events => throughput is 2.53E+04 events/s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3255s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6247s for 90112 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.2563s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6842s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.5721s for 90112 events => throughput is 2.52E+04 events/s
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.590377e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.570949e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.610150e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.596498e+04 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5624s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1763s for 8192 events => throughput is 4.65E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5542s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 8192 events => throughput is 4.87E+04 events/s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.3972s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5470s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8502s for 90112 events => throughput is 4.87E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.5019s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5803s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.9216s for 90112 events => throughput is 4.69E+04 events/s
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.010592e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.985717e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.958333e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.959096e+04 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3818s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2982s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3840s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3011s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.3684s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4497s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9187s for 90112 events => throughput is 9.81E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3753s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4512s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9241s for 90112 events => throughput is 9.75E+04 events/s
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.953639e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.005162e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.002866e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000723e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3650s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0744s for 8192 events => throughput is 1.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3672s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.2634s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4412s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8222s for 90112 events => throughput is 1.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.2690s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4430s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8260s for 90112 events => throughput is 1.09E+05 events/s
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.117525e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.111268e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.126876e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.117996e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4269s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3231s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1039s for 8192 events => throughput is 7.89E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4279s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.6060s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4689s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1371s for 90112 events => throughput is 7.92E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.6406s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4855s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1551s for 90112 events => throughput is 7.80E+04 events/s
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.896705e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.832306e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.740238e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.896180e+04 ) sec^-1
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6527s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s
+ [COUNTERS] PROGRAM TOTAL : 0.6558s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6503s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.8560s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8329s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8300s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624902e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.613028e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.902263e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.229609e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.850642e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.871226e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.238047e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.236452e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.868590e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.869896e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.248755e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.247810e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.862444e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.851703e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.745100e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.745705e+06 ) sec^-1
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 1a98ebc0f5..9b1af7b411 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'
+make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:11:05 +DATE: 2023-11-09_18:28:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5362s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] Fortran MEs ( 1 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2218s + [COUNTERS] Fortran MEs ( 1 ) : 0.3159s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5340s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s - [COUNTERS] Fortran MEs ( 1 ) : 0.3154s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5364s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2203s + [COUNTERS] Fortran MEs ( 1 ) : 0.3161s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.8590s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3887s - 
[COUNTERS] Fortran MEs ( 1 ) : 3.4703s for 90112 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9162s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3985s + [COUNTERS] Fortran MEs ( 1 ) : 3.5176s for 90112 events => throughput is 2.56E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8380s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3125s for 8192 events => throughput is 2.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3162s for 8192 events => throughput is 2.59E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1166s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6696s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4470s for 90112 events => throughput is 2.61E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1769s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4887s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.677117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.661457e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.693750e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666467e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4030s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0934s for 8192 events => throughput is 8.77E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4080s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0948s for 8192 events => throughput is 8.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4739s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4457s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0282s for 90112 events => throughput is 8.76E+04 events/s + [COUNTERS] 
PROGRAM TOTAL : 2.5043s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4696s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0347s for 90112 events => throughput is 8.71E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.839523e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.800531e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.853955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.815957e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,8 +286,8 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3025s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2591s + [COUNTERS] PROGRAM TOTAL : 0.3077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8724s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4009s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4715s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4793s for 90112 events => throughput is 1.88E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.919418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823286e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826868e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.2944s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2562s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2983s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8215s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3936s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4279s for 90112 events => throughput is 2.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8383s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4080s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4303s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.114883e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101947e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.107711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126133e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0508s for 8192 events => throughput is 1.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2717s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0503s for 8192 events => throughput is 1.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9668s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4146s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5522s for 90112 events => throughput is 1.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9888s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5637s for 90112 events => throughput is 1.60E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.619298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589248e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587181e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.56E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6498s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6490s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.7852s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7757s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8143s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8048s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.51E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.275339e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303788e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.852966e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.857184e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.672301e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.727610e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.329588e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.358085e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.661199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.712514e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.474053e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.511679e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.616407e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621450e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b41396f75b..e102a98f20 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:11:42 +DATE: 2023-11-09_18:29:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2201s - [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2214s + [COUNTERS] Fortran MEs ( 1 ) : 0.3192s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2185s - [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5369s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2199s + [COUNTERS] Fortran MEs ( 1 ) : 0.3170s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.8603s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3884s - 
[COUNTERS] Fortran MEs ( 1 ) : 3.4719s for 90112 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3845s + [COUNTERS] Fortran MEs ( 1 ) : 3.4687s for 90112 events => throughput is 2.60E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8721s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3301s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3331s for 8192 events => throughput is 2.46E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2894s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6845s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6049s for 90112 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3597s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7144s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6453s for 90112 events => throughput is 2.47E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.562016e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.553245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.546299e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536593e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1640s for 8192 events => throughput is 4.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5484s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 8192 events => throughput is 4.94E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.3591s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5386s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8206s for 90112 events => throughput is 4.95E+04 events/s + [COUNTERS] 
PROGRAM TOTAL : 3.3712s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5426s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8286s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.765208e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047917e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.784106e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.047714e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0888s for 8192 events => throughput is 9.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3884s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0838s for 8192 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3898s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9297s for 90112 events => throughput is 9.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3827s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9273s for 90112 events => throughput is 9.72E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.985245e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.974556e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2429s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4365s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8064s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2643s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8151s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067840e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.146843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069793e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3234s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1077s for 8192 events => throughput is 7.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1191s for 8192 events => throughput is 6.88E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6602s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4811s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1791s for 90112 events => throughput is 7.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7582s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5206s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2377s for 90112 events => throughput is 7.28E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.628154e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.675272e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.726777e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.626790e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6472s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7961s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8293s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.619555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635720e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.404025e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.120274e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.847979e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.835173e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231986e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.825056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.818919e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242590e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.833245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.805414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724277e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.724480e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index e6041006eb..408d8d380a 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -16,14 +16,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:12:23 +DATE: 2023-11-09_18:30:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.3823s + [COUNTERS] PROGRAM TOTAL : 4.3928s [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s - [COUNTERS] Fortran MEs ( 1 ) : 4.1043s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] Fortran MEs ( 1 ) : 4.1147s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.3581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s - [COUNTERS] Fortran MEs ( 1 ) : 4.0853s for 8192 events => throughput is 2.01E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2715s + [COUNTERS] Fortran MEs ( 1 ) : 4.1131s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.0624s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8747s - [COUNTERS] Fortran MEs ( 1 ) : 45.1877s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.4210s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8888s + [COUNTERS] Fortran MEs ( 1 ) : 45.5321s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.6032s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3774s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2258s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2521s for 8192 events => 
throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9752s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.5903s for 90112 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.9600s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0482s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.9118s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992604e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000666e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989276e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.6983s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4517s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2466s for 8192 events => throughput is 3.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7458s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4719s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2739s for 8192 events => throughput is 3.60E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.0395s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1529s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8866s for 90112 events => throughput is 3.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.7086s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1354s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5732s for 90112 events => throughput is 3.52E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775162e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697279e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.752647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704506e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2090s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2291s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9799s for 8192 events => throughput is 8.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9832s for 8192 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.6019s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8333s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7686s for 90112 events => throughput is 8.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6675s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8433s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8242s for 90112 events => throughput is 8.33E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.622945e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.632389e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.637406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.597678e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8591s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.2465s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7171s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5294s for 90112 events => throughput is 9.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1508s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4405s for 90112 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.867536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.863291e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.834174e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.840135e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4062s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3349s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0712s for 8192 events => throughput is 7.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4500s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1197s for 8192 events => throughput is 7.32E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.7127s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9424s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7703s for 90112 events => throughput is 7.66E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.0835s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0464s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0370s for 90112 events => throughput is 7.49E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.671946e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.677485e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.485706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.683279e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8073s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7752s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7243s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3497s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7514s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3551s for 90112 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.290435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.285714e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505353e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109677e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.162766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.147684e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.113597e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164951e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.114486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.433160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.432331e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index a18920ba3f..f4a809f68b 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:16:35 +DATE: 2023-11-09_18:34:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2726s - [COUNTERS] Fortran MEs ( 1 ) : 4.1766s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2768s + [COUNTERS] Fortran MEs ( 1 ) : 4.1176s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.3607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s - [COUNTERS] Fortran MEs ( 1 ) : 4.0903s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5146s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2747s + [COUNTERS] Fortran MEs ( 1 ) : 4.2399s for 8192 events => throughput is 1.93E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.0727s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8744s - [COUNTERS] Fortran MEs ( 1 ) : 45.1984s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.3456s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8962s + [COUNTERS] Fortran MEs ( 1 ) : 45.4494s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.3702s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2240s - [COUNTERS] CudaCpp MEs ( 2 
) : 4.1462s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.3558s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2546s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1013s for 8192 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 50.9666s - [COUNTERS] Fortran Overhead ( 0 ) : 5.8905s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.0761s for 90112 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.2827s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9515s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.3313s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.075529e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068073e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.074082e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068719e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5244s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1534s for 8192 events => throughput is 7.10E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3795s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1203s for 8192 events => throughput is 7.31E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.2999s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0272s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.2727s for 90112 events => throughput is 7.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4928s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0115s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4813s for 90112 events => throughput is 7.22E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.487987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.470531e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.461964e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461238e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4990s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4968s for 8192 events => throughput is 1.65E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.8843s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3862s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4981s for 90112 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8987s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3780s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.5207s for 90112 events => throughput is 1.63E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.703770e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.671559e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.715659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684139e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4306s for 8192 events => throughput is 1.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4354s for 8192 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.0325s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7425s for 90112 events => throughput is 1.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.1254s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3176s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.8078s for 90112 events => throughput is 1.87E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.946675e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932083e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957212e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934934e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7944s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5278s for 8192 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8031s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5312s for 8192 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.1973s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3930s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8043s for 90112 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3073s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4189s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8884s for 90112 events => throughput is 1.53E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.558982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547676e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568288e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.546957e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7491s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7736s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7522s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.81E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.5805s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5981s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2353s for 90112 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.598757e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602414e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.937809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925045e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.495923e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484752e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.725491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.656642e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.498449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490786e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.660457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725267e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.473649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471712e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.522099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.530964e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 05db57554d..9bed8b02d9 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:19:53 +DATE: 2023-11-09_18:37:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.3676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s - [COUNTERS] Fortran MEs ( 1 ) : 4.0901s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2752s + [COUNTERS] Fortran MEs ( 1 ) : 4.0929s for 8192 events => throughput is 2.00E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s - [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3422s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2703s + [COUNTERS] Fortran MEs ( 1 ) : 4.0719s for 8192 events => throughput is 2.01E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.1152s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8703s - [COUNTERS] 
Fortran MEs ( 1 ) : 45.2450s for 90112 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8864s + [COUNTERS] Fortran MEs ( 1 ) : 45.2857s for 90112 events => throughput is 1.99E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.7049s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4327s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2722s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4356s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2558s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.0960s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0891s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.0069s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 54.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0604s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.9495s for 90112 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.971437e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955214e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.965809e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962469e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7042s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4800s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2242s for 8192 events => throughput is 3.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7696s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4653s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3043s for 8192 events => throughput is 3.56E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 28.5105s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0554s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4551s for 90112 events => throughput is 3.68E+03 events/s + 
[COUNTERS] PROGRAM TOTAL : 28.7487s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0795s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.6692s for 90112 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.800834e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767280e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.788503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.771152e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.1858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2226s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9633s for 8192 events => throughput is 8.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9708s for 8192 events => throughput is 8.44E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.5514s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8252s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7262s for 90112 events => throughput is 8.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6387s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8343s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8044s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.756273e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.765902e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.759413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.708316e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9510s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8530s for 8192 events => throughput is 9.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9610s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1075s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8536s for 8192 events => throughput is 9.60E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.1748s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7107s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4641s for 90112 events => throughput is 9.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2117s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7290s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4827s for 90112 events => throughput is 9.50E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.859146e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.837213e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.890303e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.813722e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4412s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3447s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0965s for 8192 events => throughput is 7.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4206s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0809s for 8192 events => throughput is 7.58E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9437s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8266s for 90112 events => throughput is 7.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9517s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9023s for 90112 events => throughput is 7.57E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.668015e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.664729e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694387e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.661148e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7728s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.54E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7246s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3499s for 90112 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7640s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3587s for 90112 events => throughput is 2.51E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.280245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297023e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.525176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536170e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.116522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107408e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.157499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.153471e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118088e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172287e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176343e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.122850e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120562e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.440669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436751e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index b972c40fa5..635bc8aab0 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:25:30 +DATE: 2023-11-09_18:43:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.8408s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s - [COUNTERS] Fortran MEs ( 1 ) : 95.3863s for 8192 events => throughput is 8.59E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4594s + [COUNTERS] Fortran MEs ( 1 ) : 95.7384s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.5040s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s - [COUNTERS] Fortran MEs ( 1 ) : 95.0528s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4572s + [COUNTERS] Fortran MEs ( 1 ) : 95.7366s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1050.5151s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1583s - [COUNTERS] Fortran MEs ( 1 ) : 1046.3568s for 90112 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1056.1191s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1851s + [COUNTERS] Fortran MEs ( 1 ) 
: 1051.9341s for 90112 events => throughput is 8.57E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 216.0448s - [COUNTERS] Fortran Overhead ( 0 ) : 99.5423s - [COUNTERS] CudaCpp MEs ( 2 ) : 116.5025s for 8192 events => throughput is 7.03E+01 events/s + [COUNTERS] PROGRAM TOTAL : 221.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 101.5022s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.7500s for 8192 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1395.0826s - [COUNTERS] Fortran Overhead ( 0 ) : 101.4573s - [COUNTERS] CudaCpp MEs ( 2 ) : 1293.6254s for 90112 events => throughput is 6.97E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1409.0435s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0565s + [COUNTERS] CudaCpp MEs ( 2 ) : 1309.9869s for 90112 events => throughput is 6.88E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.294341e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535302e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.275454e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232167e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 107.3938s - [COUNTERS] Fortran Overhead ( 0 ) : 49.4703s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.9235s for 8192 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.7463s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5074s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.2390s for 8192 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 689.6088s - [COUNTERS] Fortran Overhead ( 0 ) : 53.6676s - [COUNTERS] CudaCpp MEs ( 2 ) : 635.9412s for 90112 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 695.6110s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4125s + [COUNTERS] CudaCpp MEs ( 2 ) : 642.1984s for 90112 events => throughput is 1.40E+02 
events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.663387e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667754e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.672792e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.5726s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0971s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.4754s for 8192 events => throughput is 2.98E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7441s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3520s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3921s for 8192 events => throughput is 2.99E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 326.9697s - [COUNTERS] Fortran Overhead ( 0 ) : 26.6301s - [COUNTERS] CudaCpp MEs ( 2 ) : 300.3396s for 90112 events => throughput is 3.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 331.0298s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1582s + [COUNTERS] CudaCpp MEs ( 2 ) : 303.8716s for 90112 events => throughput is 2.97E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.612820e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602735e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.630261e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607119e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.4764s - [COUNTERS] Fortran Overhead ( 0 ) : 20.3120s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.1644s for 8192 events => throughput is 3.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.2409s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3557s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8852s for 8192 events => throughput is 3.43E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 289.1902s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9124s - [COUNTERS] CudaCpp MEs ( 2 ) : 265.2778s for 90112 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 289.3981s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9732s + [COUNTERS] CudaCpp MEs ( 2 ) : 265.4249s for 90112 events => throughput is 3.40E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.088132e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111160e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127446e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141844e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 22.1825s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5139s for 8192 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.6199s + [COUNTERS] Fortran Overhead ( 0 ) : 22.4059s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2139s for 8192 events => throughput is 3.53E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 283.5251s - [COUNTERS] Fortran Overhead ( 0 ) : 25.9112s - [COUNTERS] CudaCpp MEs ( 2 ) : 257.6139s for 90112 events => throughput is 3.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 283.6130s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2046s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.4085s for 90112 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.741805e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.763228e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.777930e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.741992e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1069s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0806s for 8192 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.1979s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1190s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0789s for 8192 events => throughput is 7.59E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.7118s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8168s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8950s for 90112 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.6565s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7674s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8891s for 90112 events => throughput is 7.58E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.523661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.527117e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.283120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.256112e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.266218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.240392e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.591927e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.568765e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.251570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.279873e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.476794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.441727e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.262349e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.268118e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.252080e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240204e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 3ca211fa85..9a7b15ddba 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_23:51:54 +DATE: 2023-11-09_20:10:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.6648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s - [COUNTERS] Fortran MEs ( 1 ) : 95.2103s for 8192 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] Fortran MEs ( 1 ) : 95.1980s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.3879s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4521s - [COUNTERS] Fortran MEs ( 1 ) : 94.9358s for 8192 events => throughput is 8.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] Fortran MEs ( 1 ) : 95.1237s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1051.3512s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1998s - [COUNTERS] Fortran MEs ( 1 ) : 
1047.1514s for 90112 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1055.1274s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1731s + [COUNTERS] Fortran MEs ( 1 ) : 1050.9543s for 90112 events => throughput is 8.57E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 195.2840s - [COUNTERS] Fortran Overhead ( 0 ) : 89.6572s - [COUNTERS] CudaCpp MEs ( 2 ) : 105.6269s for 8192 events => throughput is 7.76E+01 events/s + [COUNTERS] PROGRAM TOTAL : 198.8691s + [COUNTERS] Fortran Overhead ( 0 ) : 90.2534s + [COUNTERS] CudaCpp MEs ( 2 ) : 108.6157s for 8192 events => throughput is 7.54E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1253.2021s - [COUNTERS] Fortran Overhead ( 0 ) : 93.4786s - [COUNTERS] CudaCpp MEs ( 2 ) : 1159.7235s for 90112 events => throughput is 7.77E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1275.3669s + [COUNTERS] Fortran Overhead ( 0 ) : 93.9491s + [COUNTERS] CudaCpp MEs ( 2 ) : 1181.4178s for 90112 events => throughput is 7.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.188520e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.083570e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.207566e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.167448e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.9590s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2330s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7260s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.8398s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4099s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.4299s for 8192 events => throughput is 3.10E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 312.4727s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8498s - [COUNTERS] CudaCpp MEs ( 2 ) : 285.6229s for 90112 events => throughput is 3.15E+02 
events/s + [COUNTERS] PROGRAM TOTAL : 320.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9904s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.3932s for 90112 events => throughput is 3.07E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.595667e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524011e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615224e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562557e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 25.4046s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8022s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.6023s for 8192 events => throughput is 6.02E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.3018s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8221s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.4798s for 8192 events => throughput is 6.08E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 164.6743s - [COUNTERS] Fortran Overhead ( 0 ) : 15.5764s - [COUNTERS] CudaCpp MEs ( 2 ) : 149.0979s for 90112 events => throughput is 6.04E+02 events/s + [COUNTERS] PROGRAM TOTAL : 161.8530s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4501s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.4028s for 90112 events => throughput is 6.16E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.233727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.213869e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.144603e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.163477e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.4388s - [COUNTERS] Fortran Overhead ( 0 ) : 10.5095s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9293s for 8192 events => throughput is 6.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.2497s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3581s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.8916s for 8192 events => throughput is 6.89E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 145.9227s - [COUNTERS] Fortran Overhead ( 0 ) : 13.9719s - [COUNTERS] CudaCpp MEs ( 2 ) : 131.9508s for 90112 events => throughput is 6.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 144.5243s + [COUNTERS] Fortran Overhead ( 0 ) : 14.0601s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.4642s for 90112 events => throughput is 6.91E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.277686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.261245e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.316223e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.179572e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.8899s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3435s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.5464s for 8192 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.8272s + [COUNTERS] Fortran Overhead ( 0 ) : 11.2607s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5665s for 8192 events => throughput is 7.08E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 142.7065s - [COUNTERS] Fortran Overhead ( 0 ) : 14.9424s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.7641s for 90112 events => throughput is 7.05E+02 events/s + [COUNTERS] PROGRAM TOTAL : 143.3402s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9961s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.3441s for 90112 events => throughput is 7.02E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.537880e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.537594e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.497574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456699e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4801s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4922s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4895s for 8192 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.0377s - [COUNTERS] Fortran Overhead ( 0 ) : 5.5836s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4541s for 90112 events => throughput is 1.65E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.0626s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6077s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4549s for 90112 events => throughput is 1.65E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640892e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626171e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619412e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329585e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340657e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.369301e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.426283e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.304460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326049e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376586e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360046e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.333260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.341201e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.421151e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.441486e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 2729351c42..e947131942 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-09_00:57:00 +DATE: 2023-11-09_21:16:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.3917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4564s - [COUNTERS] Fortran MEs ( 1 ) : 94.9352s for 8192 events => throughput is 8.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6107s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4599s + [COUNTERS] Fortran MEs ( 1 ) : 95.1508s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.2404s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4495s - [COUNTERS] Fortran MEs ( 1 ) : 94.7909s for 8192 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4540s + [COUNTERS] Fortran MEs ( 1 ) : 95.1304s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1049.6483s - [COUNTERS] Fortran 
Overhead ( 0 ) : 4.1482s - [COUNTERS] Fortran MEs ( 1 ) : 1045.5001s for 90112 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1052.2893s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1570s + [COUNTERS] Fortran MEs ( 1 ) : 1048.1323s for 90112 events => throughput is 8.60E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 223.2377s - [COUNTERS] Fortran Overhead ( 0 ) : 102.8564s - [COUNTERS] CudaCpp MEs ( 2 ) : 120.3813s for 8192 events => throughput is 6.81E+01 events/s + [COUNTERS] PROGRAM TOTAL : 223.0748s + [COUNTERS] Fortran Overhead ( 0 ) : 103.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.6775s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1425.5713s - [COUNTERS] Fortran Overhead ( 0 ) : 106.5194s - [COUNTERS] CudaCpp MEs ( 2 ) : 1319.0519s for 90112 events => throughput is 6.83E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1425.4469s + [COUNTERS] Fortran Overhead ( 0 ) : 107.1167s + [COUNTERS] CudaCpp MEs ( 2 ) : 1318.3302s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.033155e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.990567e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.028364e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.033316e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 110.1179s - [COUNTERS] Fortran Overhead ( 0 ) : 50.7873s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.3305s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 112.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 51.1368s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.0216s for 8192 events => throughput is 1.34E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 704.2691s - [COUNTERS] Fortran Overhead ( 0 ) : 54.2949s - [COUNTERS] CudaCpp MEs ( 2 ) : 649.9742s for 90112 events => throughput is 1.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 719.1467s + [COUNTERS] Fortran Overhead ( 0 ) : 54.6964s + [COUNTERS] CudaCpp MEs ( 2 ) : 664.4503s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635297e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625730e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628042e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622146e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.2204s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9374s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2831s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2016s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.5252s for 8192 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 314.3646s - [COUNTERS] Fortran Overhead ( 0 ) : 26.1162s - [COUNTERS] CudaCpp MEs ( 2 ) : 288.2484s for 90112 events => throughput is 3.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 312.7787s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8939s + [COUNTERS] CudaCpp MEs ( 2 ) : 286.8848s for 90112 events => throughput is 3.14E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810528e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761983e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.825565e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775859e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 42.6054s - [COUNTERS] Fortran Overhead ( 0 ) : 19.4149s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.1905s for 8192 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.1739s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9383s for 8192 events => throughput is 3.57E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 278.0352s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0285s - [COUNTERS] CudaCpp MEs ( 2 ) : 255.0067s for 90112 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 277.3137s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0478s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.2659s for 90112 events => throughput is 3.54E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.372569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.346725e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.390556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.360141e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.8365s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9299s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.9066s for 8192 events => throughput is 3.58E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9554s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.4206s for 8192 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 280.1799s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4637s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.7162s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 283.3743s + [COUNTERS] Fortran Overhead ( 0 ) : 25.7277s + [COUNTERS] CudaCpp MEs ( 2 ) : 257.6465s for 90112 events => throughput is 3.50E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.829822e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.787133e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.840554e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796022e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5385s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6761s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8624s for 8192 events => throughput is 9.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5891s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8672s for 8192 events => throughput is 9.45E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.7972s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3222s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4751s for 90112 events => throughput is 9.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8181s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4843s for 90112 events => throughput is 9.50E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.416746e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.489325e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.082101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.086868e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111361e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112402e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.159067e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.163573e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107992e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112546e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110248e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110187e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.116277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113455e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.631653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651684e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index a53e3fae12..17d6db3749 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'make[1]: Nothing to be done for 'all'. - +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:24:05 +DATE: 2023-11-09_18:42:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3033s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s - [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2361s + [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2323s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2994s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2293s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1700s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4093s - 
[COUNTERS] Fortran MEs ( 1 ) : 0.7607s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4125s + [COUNTERS] Fortran MEs ( 1 ) : 0.7635s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0760s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3194s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4974s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8220s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3472s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8289s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089572e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.102064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081996e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3132s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2728s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2761s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4682s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4441s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9307s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4537s for 
90112 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.028339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997353e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.046734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027039e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2558s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2572s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6989s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4448s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2540s for 90112 events => throughput is 3.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4610s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2579s for 90112 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495576e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523608e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465419e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.89E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6795s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4499s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2296s for 90112 events => throughput is 3.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6775s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2293s for 90112 events => throughput is 3.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.842884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760921e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.986906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.978083e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2949s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2643s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2977s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0311s for 8192 events => throughput is 2.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3444s for 90112 events => throughput is 2.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4686s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3413s for 90112 events => throughput is 2.64E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.637628e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.568787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.616200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561174e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,8 +547,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8543s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8466s + [COUNTERS] PROGRAM TOTAL : 1.8698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8622s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.567103e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555687e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.093360e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.006338e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.536245e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.515172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.495821e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.526258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.517486e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533570e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.749421e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.783496e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532375e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.773747e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.774257e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 8d2e1984e4..a15824491a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:24:34 +DATE: 2023-11-09_18:42:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3107s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3036s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2341s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2276s - [COUNTERS] Fortran MEs ( 1 ) : 0.0689s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2303s + [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1583s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4006s - [COUNTERS] Fortran MEs ( 1 ) : 0.7577s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4340s + [COUNTERS] Fortran MEs ( 1 
) : 0.7729s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3064s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0730s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.5714s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2764s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4961s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7803s for 90112 events => throughput is 1.15E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.157942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160144e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.172513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172915e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7271s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2758s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7355s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4565s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2790s for 90112 events => throughput is 3.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN 
xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194415e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.272249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.097783e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2459s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2455s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5730s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4380s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5855s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1369s for 90112 events => throughput is 6.58E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.530818e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.397086e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.313362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385448e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5576s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5778s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4506s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1272s for 90112 events => throughput is 7.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.360354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.864944e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.523552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826763e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.17E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6231s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4511s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1720s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.733153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.932364e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.992885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764394e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6542s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.57E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8561s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9395s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9332s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 90112 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.584146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.830948e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.491850e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.471030e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.856033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.130497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.715106e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.724199e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.884678e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.113825e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799322e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756435e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.441795e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.594258e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.896004e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.959495e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 19ad35f402..3468beddc5 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-08_22:25:01 +DATE: 2023-11-09_18:42:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3005s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s - [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2346s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3006s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2306s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2359s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1678s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4091s - [COUNTERS] Fortran MEs ( 1 ) : 0.7587s for 90112 events => throughput is 1.19E+05 events/s 
+ [COUNTERS] PROGRAM TOTAL : 2.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4409s + [COUNTERS] Fortran MEs ( 1 ) : 0.7766s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3817s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3333s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8245s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5224s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8322s for 90112 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.076052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083780e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.093800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087409e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3088s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8975s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4298s for 90112 events => throughput is 2.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9817s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4479s for 
90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.015345e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021169e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.988560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.048865e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2819s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2833s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7744s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2658s for 90112 events => throughput is 3.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2564s for 90112 events => throughput is 3.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.495609e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.534003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519650e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2862s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0205s for 8192 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6779s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2245s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6910s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4668s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2243s for 90112 events => throughput is 4.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.974625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.857183e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.057698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.991341e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2985s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3063s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8085s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4572s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3513s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8379s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4792s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3586s for 90112 events => throughput is 2.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.330681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.546786e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503592e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6565s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8718s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8641s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8739s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.553454e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.582711e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.988956e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.041620e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534455e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.514727e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.524256e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523754e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.513154e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.797491e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.530148e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528865e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776434e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.779970e+07 ) sec^-1 TEST COMPLETED
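
For reference, the throughput figures quoted in the [COUNTERS] lines of the logs above are simply the event count divided by the time reported on the same line (e.g. 8192 events / 0.0700s => 1.17E+05 events/s). A minimal sketch that re-derives and cross-checks them from a saved log file; the script name check_counters.py and its tolerance choice are illustrative assumptions, not part of this patch:

#!/usr/bin/env python3
# check_counters.py (illustrative name, not part of this patch):
# recompute the throughputs quoted in the [COUNTERS] lines of the tmad logs,
# e.g. "[COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events
#       => throughput is 1.17E+05 events/s",
# as events/seconds, and cross-check them against the printed figure.
import re
import sys

COUNTER = re.compile(
    r"\[COUNTERS\]\s+(?P<label>.+?)\s+\(\s*\d+\s*\)\s*:\s*"
    r"(?P<secs>[0-9.]+)s for (?P<nevt>\d+) events => "
    r"throughput is (?P<tput>[0-9.]+E[+-]\d+) events/s")

def check(path):
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = COUNTER.search(line)  # also matches lines with a +/- diff prefix
            if m is None:
                continue  # e.g. "PROGRAM TOTAL" lines carry no event count
            secs = float(m["secs"])
            nevt = int(m["nevt"])
            printed = float(m["tput"])
            computed = nevt / secs
            # The log rounds seconds to 4 decimals and the throughput to 3
            # significant figures, so the tolerance must grow as secs shrinks
            # (half an ULP of the rounded seconds, plus 0.5% for the 3 sig figs).
            tol = 0.005 + 5e-5 / secs
            status = "OK" if abs(computed - printed) / printed < tol else "MISMATCH"
            print(f"{m['label'].strip():<16} {computed:.3e} vs {printed:.3e} {status}")

if __name__ == "__main__":
    for path in sys.argv[1:]:
        check(path)

Running it over e.g. tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt should print OK for every counter line on both sides of the diff; the tolerance is deliberately widened for sub-millisecond timings (such as the 0.0005s CUDA ME lines), where the 4-decimal rounding of the seconds dominates the discrepancy.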