Skip to content

Commit

Permalink
vbf bdt vars
Browse files Browse the repository at this point in the history
  • Loading branch information
rkansal47 committed Apr 4, 2024
1 parent 28b760c commit 2e7f2b3
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 24 deletions.
6 changes: 6 additions & 0 deletions src/HHbbVV/postprocessing/BDTPreProcessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
# "bbFatJetPtOverDijetPt", # no improvement on BDT
"VVFatJetPtOverDijetPt",
"VVFatJetPtOverbbFatJetPt",
"DijetdEta",
"DijetdPhi",
"vbf_Mass_jj",
"vbf_dEta_jj",
"finalWeight",
]

Expand Down Expand Up @@ -99,6 +103,8 @@ def save_bdt_data(
events["Dataset"] = key
bdt_events_dict.append(events)

print("Saving BDT data to", out_file)

bdt_events = pd.concat(bdt_events_dict, axis=0)
table = pa.Table.from_pandas(bdt_events)
pq.write_table(table, out_file)
Expand Down
30 changes: 22 additions & 8 deletions src/HHbbVV/postprocessing/TrainBDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@
"VVFatJetPt",
"VVFatJetPtOverbbFatJetPt",
"MET_pt",
"DijetdEta",
"DijetdPhi", # TODO: current dPhi is buggy
"vbf_Mass_jj",
"vbf_dEta_jj",
]


Expand All @@ -89,16 +93,16 @@

# ignore bins
var_label_map = {
"MET_pt": ([50, 0, 250], r"$p^{miss}_T$ (GeV)"),
"MET_pt": ([50, 0, 250], r"$p^{miss}_T$"),
"DijetEta": ([50, -8, 8], r"$\eta^{jj}$"),
"DijetPt": ([50, 0, 750], r"$p_T^{jj}$ (GeV)"),
"DijetMass": ([50, 500, 3000], r"$m^{jj}$ (GeV)"),
"DijetPt": ([50, 0, 750], r"$p_T^{jj}$"),
"DijetMass": ([50, 500, 3000], r"$m^{jj}$"),
"bbFatJetEta": ([50, -2.4, 2.4], r"$\eta^{bb}$"),
"bbFatJetPt": ([50, 300, 1300], r"$p^{bb}_T$ (GeV)"),
"bbFatJetPt": ([50, 300, 1300], r"$p^{bb}_T$"),
"VVFatJetEta": ([50, -2.4, 2.4], r"$\eta^{VV}$"),
"VVFatJetPt": ([50, 300, 1300], r"$p^{VV}_T$ (GeV)"),
"VVFatJetParticleNetMass": ([50, 0, 300], r"$m^{VV}_{reg}$ (GeV)"),
# "VVFatJetMsd": ([50, 0, 300], r"$m^{VV}_{msd}$ (GeV)"),
"VVFatJetPt": ([50, 300, 1300], r"$p^{VV}_T$"),
"VVFatJetParticleNetMass": ([50, 0, 300], r"$m^{VV}_{reg}$"),
# "VVFatJetMsd": ([50, 0, 300], r"$m^{VV}_{msd}$"),
"VVFatJetParTMD_THWWvsT": ([50, 0, 1], r"ParT $T_{HWW}$"),
"VVFatJetParTMD_probT": ([50, 0, 1], r"ParT $Prob(Top)^{VV}$"),
"VVFatJetParTMD_probQCD": ([50, 0, 1], r"ParT $Prob(QCD)^{VV}$"),
Expand All @@ -107,6 +111,10 @@
"bbFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{bb}_T / p_T^{jj}$"),
"VVFatJetPtOverDijetPt": ([50, 0, 40], r"$p^{VV}_T / p_T^{jj}$"),
"VVFatJetPtOverbbFatJetPt": ([50, 0.4, 2.0], r"$p^{VV}_T / p^{bb}_T$"),
"DijetdEta": ([50, 0, 8], r"$\Delta\eta^{jj}$"),
"DijetdPhi": ([50, 0, 8], r"$\Delta\varphi^{jj}$"),
"vbf_Mass_jj": ([50, 0, 5000], r"$m^{jj}_{VBF}$"),
"vbf_dEta_jj": ([50, 0, 8], r"$\Delta\eta^{jj}_{VBF}$"),
}


Expand Down Expand Up @@ -828,11 +836,17 @@ def do_inference(
"Equalise each backgrounds' weights too",
default=False,
)

"""
VERY minor improvement using this:
https://hhbbvv.nrp-nautilus.io/bdt/24_04_03_k2v0_training_bf/rocs_test/roc.pdf
https://hhbbvv.nrp-nautilus.io/bdt/24_04_03_k2v0_training_eqsig_bf/rocs_test/roc.pdf
"""
add_bool_arg(
parser,
"equalize-sig-total",
"Total signal = total bg, rather than each signal's total equals the total background (only matters for multiple signals)",
default=False,
default=True,
)

parser.add_argument(
Expand Down
88 changes: 72 additions & 16 deletions src/HHbbVV/postprocessing/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,21 +620,25 @@ def _init(args):
# adds all necessary columns to dataframes from events_dict
def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""):
"""Variables needed for ggF and/or VBF BDTs"""
# import time

import vector

# start = time.time()

bbJet = utils.make_vector(df, "bbFatJet", bb_mask, ptlabel=ptlabel, mlabel=mlabel)
VVJet = utils.make_vector(df, "VVFatJet", bb_mask, ptlabel=ptlabel, mlabel=mlabel)
Dijet = bbJet + VVJet

# print(f"Time to make vectors: {time.time() - start:.2f}")

if f"DijetMass{ptlabel}{mlabel}" not in df.columns:
df[f"DijetMass{ptlabel}{mlabel}"] = Dijet.mass
df[f"DijetPt{ptlabel}{mlabel}"] = Dijet.pt
df[f"VVFatJetPtOverbbFatJetPt{ptlabel}{mlabel}"] = VVJet.pt / bbJet.pt
df[f"VVFatJetPtOverDijetPt{ptlabel}{mlabel}"] = VVJet.pt / df[f"DijetPt{ptlabel}{mlabel}"]

if not vbf_vars:
return

import vector
# print(f"AK8 jet vars: {time.time() - start:.2f}")

vbf1 = vector.array(
{
Expand All @@ -656,6 +660,20 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""):

jj = vbf1 + vbf2

if "DijetdEta" not in df.columns:
df["DijetdEta"] = np.abs(bbJet.eta - VVJet.eta)
if "DijetdPhi" not in df.columns:
df["DijetdPhi"] = np.abs(bbJet.delta_phi(VVJet))
if f"vbf_Mass_jj{ptlabel}" not in df.columns:
df[f"vbf_Mass_jj{ptlabel}"] = jj.M
if "vbf_dEta_jj" not in df.columns:
df["vbf_dEta_jj"] = np.abs(vbf1.eta - vbf2.eta)

# print(f"VBF jet vars: {time.time() - start:.2f}")

if not vbf_vars:
return

# Adapted from HIG-20-005 ggF_Killer 6.2.2
# https://coffeateam.github.io/coffea/api/coffea.nanoevents.methods.vector.PtEtaPhiMLorentzVector.html
# https://coffeateam.github.io/coffea/api/coffea.nanoevents.methods.vector.LorentzVector.html
Expand All @@ -674,15 +692,15 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""):
df["vbf_dR_j1_Hbb"] = vbf2.deltaR(bbJet)
if "vbf_dR_jj" not in df.columns:
df["vbf_dR_jj"] = vbf1.deltaR(vbf2)
if "vbf_Mass_jj{ptlabel}" not in df.columns:
if f"vbf_Mass_jj{ptlabel}" not in df.columns:
df[f"vbf_Mass_jj{ptlabel}"] = jj.M
if "vbf_dEta_jj" not in df.columns:
df["vbf_dEta_jj"] = np.abs(vbf1.eta - vbf2.eta)

if "DijetdEta" not in df.columns:
df["DijetdEta"] = np.abs(bbJet.eta - VVJet.eta)
if "DijetdPhi" not in df.columns:
df["DijetdPhi"] = np.abs(bbJet.phi - VVJet.phi)
df["DijetdPhi"] = np.abs(bbJet.delta_phi(VVJet))

# Subleading VBF-jet cos(θ) in the HH+2j center of mass frame:
# https://github.com/scikit-hep/vector/blob/main/src/vector/_methods.py#L916
Expand Down Expand Up @@ -1184,6 +1202,7 @@ def load_bdt_preds(
bdt_preds = np.load(f"{bdt_preds_dir}/{year}/preds.npy")

multiclass = len(bdt_preds.shape) > 1
multisig = len(bdt_preds.shape) > 3

if jec_jmsr_shifts:
shift_preds = {
Expand All @@ -1202,22 +1221,59 @@ def load_bdt_preds(
if not multiclass:
events["BDTScore"] = bdt_preds[i : i + num_events]
else:
events["BDTScore"] = bdt_preds[i : i + num_events, 0]
events["BDTScoreQCD"] = bdt_preds[i : i + num_events, 1]
events["BDTScoreTT"] = bdt_preds[i : i + num_events, 2]
events["BDTScoreZJets"] = 1 - np.sum(bdt_preds[i : i + num_events], axis=1)
if multisig:
num_sigs = 2
bg_tot = np.sum(bdt_preds[i : i + num_events][num_sigs:], axis=1)
ggf_score = bdt_preds[i : i + num_events, 0]
vbf_score = bdt_preds[i : i + num_events, 1]

events["BDTScore"] = ggf_score / (ggf_score + bg_tot)
events["BDTScoreVBF"] = vbf_score / (vbf_score + bg_tot)
events["BDTScoreQCD"] = bdt_preds[i : i + num_events, num_sigs]
events["BDTScoreTT"] = bdt_preds[i : i + num_events, num_sigs + 1]
events["BDTScoreZjets"] = bdt_preds[i : i + num_events, num_sigs + 2]
else:
events["BDTScore"] = bdt_preds[i : i + num_events, 0]
events["BDTScoreQCD"] = bdt_preds[i : i + num_events, 1]
events["BDTScoreTT"] = bdt_preds[i : i + num_events, 2]
events["BDTScoreZJets"] = 1 - np.sum(bdt_preds[i : i + num_events], axis=1)

if jec_jmsr_shifts and sample != data_key:
for jshift in jec_shifts + jmsr_shifts:
if not multiclass:
events["BDTScore_" + jshift] = shift_preds[jshift][i : i + num_events]
else:
events["BDTScore_" + jshift] = shift_preds[jshift][i : i + num_events, 0]
events["BDTScoreQCD_" + jshift] = shift_preds[jshift][i : i + num_events, 1]
events["BDTScoreTT_" + jshift] = shift_preds[jshift][i : i + num_events, 2]
events["BDTScoreZJets_" + jshift] = 1 - np.sum(
shift_preds[jshift][i : i + num_events], axis=1
)
if multisig:
bg_tot = np.sum(
shift_preds[jshift][i : i + num_events][num_sigs:], axis=1
)
ggf_score = shift_preds[jshift][i : i + num_events, 0]
vbf_score = shift_preds[jshift][i : i + num_events, 1]

events["BDTScore_" + jshift] = ggf_score / (ggf_score + bg_tot)
events["BDTScoreVBF_" + jshift] = vbf_score / (vbf_score + bg_tot)
events["BDTScoreQCD_" + jshift] = shift_preds[jshift][
i : i + num_events, num_sigs
]
events["BDTScoreTT_" + jshift] = shift_preds[jshift][
i : i + num_events, num_sigs + 1
]
events["BDTScoreZJets_" + jshift] = shift_preds[jshift][
i : i + num_events, num_sigs + 2
]
else:
events["BDTScore_" + jshift] = shift_preds[jshift][
i : i + num_events, 0
]
events["BDTScoreQCD_" + jshift] = shift_preds[jshift][
i : i + num_events, 1
]
events["BDTScoreTT_" + jshift] = shift_preds[jshift][
i : i + num_events, 2
]
events["BDTScoreZJets_" + jshift] = 1 - np.sum(
shift_preds[jshift][i : i + num_events], axis=1
)

i += num_events

Expand Down

0 comments on commit 2e7f2b3

Please sign in to comment.