Skip to content

Commit

Permalink
script bug fixes + wjets-training arg
Browse files Browse the repository at this point in the history
  • Loading branch information
rkansal47 committed Mar 12, 2024
1 parent e779dbc commit 95c7fbe
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 35 deletions.
21 changes: 12 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ using the [coffea](https://coffeateam.github.io/coffea/) and
- [BDT Pre-Processing](#bdt-pre-processing)
- [BDT Trainings](#bdt-trainings)
- [Post-Processing](#post-processing-1)
- [Control plots with resonant and nonresonant samples](#control-plots-with-resonant-and-nonresonant-samples)
- [Control plots](#control-plots)
- [BDT sculpting plots](#bdt-sculpting-plots)
- [Making separate background and signal templates for scan and bias tests (resonant)](#making-separate-background-and-signal-templates-for-scan-and-bias-tests-resonant)
- [Create Datacard](#create-datacard)
Expand Down Expand Up @@ -325,29 +325,32 @@ python TrainBDT.py --data-path "../../../../data/skimmer/Feb24/bdt_data" --year

### Post-Processing

**Important:** If running on a Mac, make sure to install `gnu-getopt` first for
bash scripts, see [here.](#getopt-for-mac)

```bash
python postprocessing.py --templates --year "2017" --template-dir "templates/$TAG/" --plot-dir "../../../plots/PostProcessing/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" (--resonant --signal-data-dir "" --control-plots)
```

All years (non-resonant):

```bash
for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/Jun14" --data-dir "../../../../data/skimmer/Feb24" --signal-data-dir "../../../../data/skimmer/Jun10" --bdt-preds-dir "../../../../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences"; done
./bash_scripts/NonresTemplates.sh --tag $TAG # remember to change data_dir as needed!
```

Scan (non-resonant):

```bash
for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" --old-processor --nonres-txbb-wp "LP" "MP" "HP" --nonres-bdt-wp 0.995 0.998 0.999 --no-do-jshifts; done
./bash_scripts/NonresTemplatesScan.sh --tag $TAG # remember to change data_dir as needed!
```

#### Control plots with resonant and nonresonant samples

Run `postprocessing/bash_scripts/ControlPlots.sh` from inside
`postprocessing folder`.
#### Control plots

**Important:** If running on a Mac, make sure to install `gnu-getopt` first, see
[here.](#getopt-for-mac)
```bash
./bash_scripts/ControlPlot.sh --tag $TAG # w/ resonant and nonresonant samples and all control plot variables in postprocessing.py script by default
./bash_scripts/ControlPlots.sh --tag $TAG --nonresonant --controlplotvars BDTScore --nohem2d # BDT score only
./bash_scripts/MassPlots.sh --tag $TAG # mSD vs mReg plots
```

#### BDT sculpting plots

Expand Down
39 changes: 27 additions & 12 deletions src/HHbbVV/postprocessing/TrainBDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,6 @@

weight_key = "finalWeight"
sig_key = "HHbbVV"
bg_keys = ["QCD", "TT", "Z+Jets"]
training_keys = [sig_key] + bg_keys

# if doing multiclass classification, encode each process separately
label_encoder = LabelEncoder()
label_encoder.fit(training_keys)


# only vars used for training, ordered by importance
Expand Down Expand Up @@ -145,7 +139,9 @@ def get_X(
return pd.concat(X, axis=0), mc_vars


def get_Y(data_dict: dict[str, pd.DataFrame], multiclass: bool = False):
def get_Y(
data_dict: dict[str, pd.DataFrame], multiclass: bool = False, label_encoder: LabelEncoder = None
):
Y = []
for _year, data in data_dict.items():
if multiclass:
Expand Down Expand Up @@ -179,7 +175,10 @@ def remove_neg_weights(data: pd.DataFrame):


def equalize_weights(
data: pd.DataFrame, equalize_sig_bg: bool = True, equalize_per_process: bool = False
data: pd.DataFrame,
bg_keys: list[str],
equalize_sig_bg: bool = True,
equalize_per_process: bool = False,
):
"""
If `equalize_sig_bg`: scales signal such that total signal = total background
Expand Down Expand Up @@ -214,6 +213,15 @@ def load_data(data_path: str, year: str, all_years: bool):


def main(args):
bg_keys = ["QCD", "TT", "Z+Jets"]
if args.wjets_training:
bg_keys += ["W+Jets"]
training_keys = [sig_key] + bg_keys

# if doing multiclass classification, encode each process separately
label_encoder = LabelEncoder()
label_encoder.fit(training_keys)

bdtVars = AllTaggerBDTVars if args.all_tagger_vars else SingleTaggerBDTVars

early_stopping_callback = xgb.callback.EarlyStopping(
Expand Down Expand Up @@ -314,7 +322,9 @@ def main(args):
f'{np.sum(data[data["Dataset"] == key][weight_key])}'
)

equalize_weights(data, args.equalize_weights, args.equalize_weights_per_process)
equalize_weights(
data, bg_keys, args.equalize_weights, args.equalize_weights_per_process
)

for key in training_keys:
print(
Expand All @@ -341,8 +351,8 @@ def main(args):
model = train_model(
get_X(train, bdtVars),
get_X(test, bdtVars),
get_Y(train, args.multiclass),
get_Y(test, args.multiclass),
get_Y(train, args.multiclass, label_encoder),
get_Y(test, args.multiclass, label_encoder),
get_weights(train, args.absolute_weights),
get_weights(test, args.absolute_weights),
bdtVars,
Expand All @@ -358,6 +368,7 @@ def main(args):
train,
test,
args.test_size,
training_keys,
args.equalize_weights,
bdtVars,
multiclass=args.multiclass,
Expand Down Expand Up @@ -430,6 +441,7 @@ def evaluate_model(
train: dict[str, pd.DataFrame],
test: dict[str, pd.DataFrame],
test_size: float,
training_keys: list[str],
equalize_sig_bg: bool,
bdtVars: list[str],
txbb_threshold: float = 0.98,
Expand Down Expand Up @@ -677,6 +689,8 @@ def do_inference(
type=int,
)

add_bool_arg(parser, "wjets-training", "Include W+Jets in training", default=False)

"""
Varying between 0.01 - 1 showed no significant difference
https://hhbbvv.nrp-nautilus.io/bdt/24_03_07_new_samples_lr_0.01/
Expand Down Expand Up @@ -704,9 +718,10 @@ def do_inference(
help="minimum weight required to keep splitting (higher is more conservative)",
type=float,
)

# This just needs to be higher than the # rounds needed for early-stopping to kick in
parser.add_argument(
"--n-estimators", default=1 - 000, help="max number of trees to keep adding", type=int
"--n-estimators", default=10000, help="max number of trees to keep adding", type=int
)

parser.add_argument("--rem-feats", default=0, help="remove N lowest importance feats", type=int)
Expand Down
20 changes: 13 additions & 7 deletions src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,30 @@
####################################################################################################

MAIN_DIR="../../.."
data_dir="$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars"
TAG=""
resonant="--resonant"
samples="HHbbVV VBFHHbbVV NMSSM_XToYHTo2W2BTo4Q2B_MX-900_MY-80 NMSSM_XToYHTo2W2BTo4Q2B_MX-1200_MY-190 NMSSM_XToYHTo2W2BTo4Q2B_MX-2000_MY-125 NMSSM_XToYHTo2W2BTo4Q2B_MX-3000_MY-250 NMSSM_XToYHTo2W2BTo4Q2B_MX-4000_MY-150"
# samples="HHbbVV"
hem2d="--HEM2d"
controlplotvars=""

options=$(getopt -o "" --long "nonresonant,nohem2d,tag:" -- "$@")
options=$(getopt -o "" --long "nonresonant,nohem2d,controlplotvars:,tag:" -- "$@")
eval set -- "$options"

while true; do
case "$1" in
--nonresonant)
resonant=""
samples="HHbbVV VBFHHbbVV"
samples="HHbbVV VBFHHbbVV qqHH_CV_1_C2V_0_kl_1_HHbbVV qqHH_CV_1_C2V_2_kl_1_HHbbVV"
;;
--nohem2d)
hem2d=""
;;
--controlplotvars)
shift
controlplotvars="--control-plot-vars $1"
;;
--tag)
shift
TAG=$1
Expand All @@ -56,12 +63,11 @@ if [[ -z $TAG ]]; then
exit 1
fi

# for year in 2016APV 2016 2017 2018
for year in 2018
for year in 2016APV 2016 2017 2018
do
python -u postprocessing.py --control-plots --year $year ${resonant} ${hem2d} \
--data-dir "${MAIN_DIR}/../data/skimmer/24Mar5AllYears" \
--data-dir $data_dir \
--sig-samples $samples \
--bdt-preds-dir "${MAIN_DIR}/../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences" \
--plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG"
--bdt-preds-dir "$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars/24_03_07_new_samples_max_depth_5/inferences" \
--plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG" ${controlplotvars}
done
14 changes: 7 additions & 7 deletions src/HHbbVV/postprocessing/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class Region:
control_plot_vars = [
ShapeVar(var="MET_pt", label=r"$p^{miss}_T$ (GeV)", bins=[20, 0, 300]),
# ShapeVar(var="DijetEta", label=r"$\eta^{jj}$", bins=[20, -8, 8]),
ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]),
# ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]),
ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[20, 600, 4000]),
ShapeVar(var="bbFatJetEta", label=r"$\eta^{bb}$", bins=[20, -2.4, 2.4]),
ShapeVar(
Expand Down Expand Up @@ -114,12 +114,12 @@ class Region:
# ShapeVar(var="VVFatJetParTMD_probT", label=r"Prob(Top) (Mass-Decorrelated)", bins=[20, 0, 1]),
ShapeVar(var="VVFatJetParTMD_THWWvsT", label=r"$T^{VV}_{HWW}$", bins=[20, 0, 1]),
# ShapeVar(var="bbFatJetPtOverDijetPt", label=r"$p^{bb}_T / p_T^{jj}$", bins=[20, 0, 40]),
ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]),
ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]),
# ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]),
# ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]),
ShapeVar(var="nGoodMuonsHbb", label=r"# of Muons", bins=[3, 0, 3]),
ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]),
# ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]),
ShapeVar(var="nGoodElectronsHbb", label=r"# of Electrons", bins=[3, 0, 3]),
ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]),
# ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]),
# removed if not ggF nonresonant - needs to be the last variable!
ShapeVar(var="BDTScore", label=r"BDT Score", bins=[50, 0, 1]),
]
Expand Down Expand Up @@ -415,7 +415,6 @@ def main(args):
# only need to worry about variations if making templates
events_dict = _load_samples(args, bg_samples, sig_samples, cutflow, variations=args.templates)
bb_masks = bb_VV_assignment(events_dict)

# QCD xsec normalization for plots
qcd_sf(events_dict, cutflow)

Expand All @@ -424,6 +423,7 @@ def main(args):
events_dict,
bb_masks,
nonres_vars=args.vbf or args.control_plots,
# nonres_vars=args.vbf,
vbf_vars=args.vbf,
do_jshifts=args.vbf, # only need shifts for BDT pre-processing
)
Expand Down Expand Up @@ -622,7 +622,7 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""):
VVJet = utils.make_vector(df, "VVFatJet", bb_mask, ptlabel=ptlabel, mlabel=mlabel)
Dijet = bbJet + VVJet

if f"DijetPt{ptlabel}{mlabel}" not in df.columns:
if f"DijetMass{ptlabel}{mlabel}" not in df.columns:
df[f"DijetMass{ptlabel}{mlabel}"] = Dijet.mass
df[f"DijetPt{ptlabel}{mlabel}"] = Dijet.pt
df[f"VVFatJetPtOverbbFatJetPt{ptlabel}{mlabel}"] = VVJet.pt / bbJet.pt
Expand Down
1 change: 1 addition & 0 deletions src/HHbbVV/processors/bbVVSkimmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class bbVVSkimmer(SkimmerABC):
"VBFJetPt",
"VBFJetPhi",
"VBFJetMass",
"DijetMass",
"nGoodVBFJets",
"ak8FatJetHbb",
"ak8FatJetHVV",
Expand Down

0 comments on commit 95c7fbe

Please sign in to comment.