From 95c7fbe8a4d146416c903ff42be565278055abd3 Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Tue, 12 Mar 2024 11:39:43 -0500 Subject: [PATCH] script bug fixes + wjets-training arg --- README.md | 21 +++++----- src/HHbbVV/postprocessing/TrainBDT.py | 39 +++++++++++++------ .../bash_scripts/ControlPlots.sh | 20 ++++++---- src/HHbbVV/postprocessing/postprocessing.py | 14 +++---- src/HHbbVV/processors/bbVVSkimmer.py | 1 + 5 files changed, 60 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 7cfd0ff3..41aeebff 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ using the [coffea](https://coffeateam.github.io/coffea/) and - [BDT Pre-Processing](#bdt-pre-processing) - [BDT Trainings](#bdt-trainings) - [Post-Processing](#post-processing-1) - - [Control plots with resonant and nonresonant samples](#control-plots-with-resonant-and-nonresonant-samples) + - [Control plots](#control-plots) - [BDT sculpting plots](#bdt-sculpting-plots) - [Making separate background and signal templates for scan and bias tests (resonant)](#making-separate-background-and-signal-templates-for-scan-and-bias-tests-resonant) - [Create Datacard](#create-datacard) @@ -325,6 +325,9 @@ python TrainBDT.py --data-path "../../../../data/skimmer/Feb24/bdt_data" --year ### Post-Processing +**Important:** If running on a Mac, make sure to install `gnu-getopt` first for +bash scripts, see [here.](#getopt-for-mac) + ```bash python postprocessing.py --templates --year "2017" --template-dir "templates/$TAG/" --plot-dir "../../../plots/PostProcessing/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" (--resonant --signal-data-dir "" --control-plots) ``` @@ -332,22 +335,22 @@ python postprocessing.py --templates --year "2017" --template-dir "templates/$TA All years (non-resonant): ```bash -for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/Jun14" --data-dir "../../../../data/skimmer/Feb24" --signal-data-dir "../../../../data/skimmer/Jun10" --bdt-preds-dir "../../../../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences"; done +./bash_scripts/NonresTemplates.sh --tag $TAG # remember to change data_dir as needed! ``` Scan (non-resonant): ```bash -for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" --old-processor --nonres-txbb-wp "LP" "MP" "HP" --nonres-bdt-wp 0.995 0.998 0.999 --no-do-jshifts; done +./bash_scripts/NonresTemplatesScan.sh --tag $TAG # remember to change data_dir as needed! ``` -#### Control plots with resonant and nonresonant samples - -Run `postprocessing/bash_scripts/ControlPlots.sh` from inside -`postprocessing folder`. +#### Control plots -**Important:** If running on a Mac, make sure to install `gnu-getopt` first, see -[here.](#getopt-for-mac) +```bash +./bash_scripts/ControlPlot.sh --tag $TAG # w/ resonant and nonresonant samples and all control plot variables in postprocessing.py script by default +./bash_scripts/ControlPlots.sh --tag $TAG --nonresonant --controlplotvars BDTScore --nohem2d # BDT score only +./bash_scripts/MassPlots.sh --tag $TAG # mSD vs mReg plots +``` #### BDT sculpting plots diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index 33a4ebca..8511e1b4 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -47,12 +47,6 @@ weight_key = "finalWeight" sig_key = "HHbbVV" -bg_keys = ["QCD", "TT", "Z+Jets"] -training_keys = [sig_key] + bg_keys - -# if doing multiclass classification, encode each process separately -label_encoder = LabelEncoder() -label_encoder.fit(training_keys) # only vars used for training, ordered by importance @@ -145,7 +139,9 @@ def get_X( return pd.concat(X, axis=0), mc_vars -def get_Y(data_dict: dict[str, pd.DataFrame], multiclass: bool = False): +def get_Y( + data_dict: dict[str, pd.DataFrame], multiclass: bool = False, label_encoder: LabelEncoder = None +): Y = [] for _year, data in data_dict.items(): if multiclass: @@ -179,7 +175,10 @@ def remove_neg_weights(data: pd.DataFrame): def equalize_weights( - data: pd.DataFrame, equalize_sig_bg: bool = True, equalize_per_process: bool = False + data: pd.DataFrame, + bg_keys: list[str], + equalize_sig_bg: bool = True, + equalize_per_process: bool = False, ): """ If `equalize_sig_bg`: scales signal such that total signal = total background @@ -214,6 +213,15 @@ def load_data(data_path: str, year: str, all_years: bool): def main(args): + bg_keys = ["QCD", "TT", "Z+Jets"] + if args.wjets_training: + bg_keys += ["W+Jets"] + training_keys = [sig_key] + bg_keys + + # if doing multiclass classification, encode each process separately + label_encoder = LabelEncoder() + label_encoder.fit(training_keys) + bdtVars = AllTaggerBDTVars if args.all_tagger_vars else SingleTaggerBDTVars early_stopping_callback = xgb.callback.EarlyStopping( @@ -314,7 +322,9 @@ def main(args): f'{np.sum(data[data["Dataset"] == key][weight_key])}' ) - equalize_weights(data, args.equalize_weights, args.equalize_weights_per_process) + equalize_weights( + data, bg_keys, args.equalize_weights, args.equalize_weights_per_process + ) for key in training_keys: print( @@ -341,8 +351,8 @@ def main(args): model = train_model( get_X(train, bdtVars), get_X(test, bdtVars), - get_Y(train, args.multiclass), - get_Y(test, args.multiclass), + get_Y(train, args.multiclass, label_encoder), + get_Y(test, args.multiclass, label_encoder), get_weights(train, args.absolute_weights), get_weights(test, args.absolute_weights), bdtVars, @@ -358,6 +368,7 @@ def main(args): train, test, args.test_size, + training_keys, args.equalize_weights, bdtVars, multiclass=args.multiclass, @@ -430,6 +441,7 @@ def evaluate_model( train: dict[str, pd.DataFrame], test: dict[str, pd.DataFrame], test_size: float, + training_keys: list[str], equalize_sig_bg: bool, bdtVars: list[str], txbb_threshold: float = 0.98, @@ -677,6 +689,8 @@ def do_inference( type=int, ) + add_bool_arg(parser, "wjets-training", "Include W+Jets in training", default=False) + """ Varying between 0.01 - 1 showed no significant difference https://hhbbvv.nrp-nautilus.io/bdt/24_03_07_new_samples_lr_0.01/ @@ -704,9 +718,10 @@ def do_inference( help="minimum weight required to keep splitting (higher is more conservative)", type=float, ) + # This just needs to be higher than the # rounds needed for early-stopping to kick in parser.add_argument( - "--n-estimators", default=1 - 000, help="max number of trees to keep adding", type=int + "--n-estimators", default=10000, help="max number of trees to keep adding", type=int ) parser.add_argument("--rem-feats", default=0, help="remove N lowest importance feats", type=int) diff --git a/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh b/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh index 8c049254..0de07451 100755 --- a/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh +++ b/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh @@ -15,23 +15,30 @@ #################################################################################################### MAIN_DIR="../../.." +data_dir="$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars" TAG="" resonant="--resonant" samples="HHbbVV VBFHHbbVV NMSSM_XToYHTo2W2BTo4Q2B_MX-900_MY-80 NMSSM_XToYHTo2W2BTo4Q2B_MX-1200_MY-190 NMSSM_XToYHTo2W2BTo4Q2B_MX-2000_MY-125 NMSSM_XToYHTo2W2BTo4Q2B_MX-3000_MY-250 NMSSM_XToYHTo2W2BTo4Q2B_MX-4000_MY-150" +# samples="HHbbVV" hem2d="--HEM2d" +controlplotvars="" -options=$(getopt -o "" --long "nonresonant,nohem2d,tag:" -- "$@") +options=$(getopt -o "" --long "nonresonant,nohem2d,controlplotvars:,tag:" -- "$@") eval set -- "$options" while true; do case "$1" in --nonresonant) resonant="" - samples="HHbbVV VBFHHbbVV" + samples="HHbbVV VBFHHbbVV qqHH_CV_1_C2V_0_kl_1_HHbbVV qqHH_CV_1_C2V_2_kl_1_HHbbVV" ;; --nohem2d) hem2d="" ;; + --controlplotvars) + shift + controlplotvars="--control-plot-vars $1" + ;; --tag) shift TAG=$1 @@ -56,12 +63,11 @@ if [[ -z $TAG ]]; then exit 1 fi -# for year in 2016APV 2016 2017 2018 -for year in 2018 +for year in 2016APV 2016 2017 2018 do python -u postprocessing.py --control-plots --year $year ${resonant} ${hem2d} \ - --data-dir "${MAIN_DIR}/../data/skimmer/24Mar5AllYears" \ + --data-dir $data_dir \ --sig-samples $samples \ - --bdt-preds-dir "${MAIN_DIR}/../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences" \ - --plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG" + --bdt-preds-dir "$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars/24_03_07_new_samples_max_depth_5/inferences" \ + --plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG" ${controlplotvars} done diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 2c719cb8..1526ffce 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -83,7 +83,7 @@ class Region: control_plot_vars = [ ShapeVar(var="MET_pt", label=r"$p^{miss}_T$ (GeV)", bins=[20, 0, 300]), # ShapeVar(var="DijetEta", label=r"$\eta^{jj}$", bins=[20, -8, 8]), - ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]), + # ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]), ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[20, 600, 4000]), ShapeVar(var="bbFatJetEta", label=r"$\eta^{bb}$", bins=[20, -2.4, 2.4]), ShapeVar( @@ -114,12 +114,12 @@ class Region: # ShapeVar(var="VVFatJetParTMD_probT", label=r"Prob(Top) (Mass-Decorrelated)", bins=[20, 0, 1]), ShapeVar(var="VVFatJetParTMD_THWWvsT", label=r"$T^{VV}_{HWW}$", bins=[20, 0, 1]), # ShapeVar(var="bbFatJetPtOverDijetPt", label=r"$p^{bb}_T / p_T^{jj}$", bins=[20, 0, 40]), - ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]), - ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]), + # ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]), + # ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]), ShapeVar(var="nGoodMuonsHbb", label=r"# of Muons", bins=[3, 0, 3]), - ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]), + # ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]), ShapeVar(var="nGoodElectronsHbb", label=r"# of Electrons", bins=[3, 0, 3]), - ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]), + # ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]), # removed if not ggF nonresonant - needs to be the last variable! ShapeVar(var="BDTScore", label=r"BDT Score", bins=[50, 0, 1]), ] @@ -415,7 +415,6 @@ def main(args): # only need to worry about variations if making templates events_dict = _load_samples(args, bg_samples, sig_samples, cutflow, variations=args.templates) bb_masks = bb_VV_assignment(events_dict) - # QCD xsec normalization for plots qcd_sf(events_dict, cutflow) @@ -424,6 +423,7 @@ def main(args): events_dict, bb_masks, nonres_vars=args.vbf or args.control_plots, + # nonres_vars=args.vbf, vbf_vars=args.vbf, do_jshifts=args.vbf, # only need shifts for BDT pre-processing ) @@ -622,7 +622,7 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): VVJet = utils.make_vector(df, "VVFatJet", bb_mask, ptlabel=ptlabel, mlabel=mlabel) Dijet = bbJet + VVJet - if f"DijetPt{ptlabel}{mlabel}" not in df.columns: + if f"DijetMass{ptlabel}{mlabel}" not in df.columns: df[f"DijetMass{ptlabel}{mlabel}"] = Dijet.mass df[f"DijetPt{ptlabel}{mlabel}"] = Dijet.pt df[f"VVFatJetPtOverbbFatJetPt{ptlabel}{mlabel}"] = VVJet.pt / bbJet.pt diff --git a/src/HHbbVV/processors/bbVVSkimmer.py b/src/HHbbVV/processors/bbVVSkimmer.py index d40acea3..ce6b4551 100644 --- a/src/HHbbVV/processors/bbVVSkimmer.py +++ b/src/HHbbVV/processors/bbVVSkimmer.py @@ -123,6 +123,7 @@ class bbVVSkimmer(SkimmerABC): "VBFJetPt", "VBFJetPhi", "VBFJetMass", + "DijetMass", "nGoodVBFJets", "ak8FatJetHbb", "ak8FatJetHVV",