diff --git a/configs/run3_studies/test.py b/configs/run3_studies/test.py new file mode 100644 index 00000000..8b34f0f9 --- /dev/null +++ b/configs/run3_studies/test.py @@ -0,0 +1,99 @@ + +import sys +import os +filedir = os.path.dirname(os.path.realpath(__file__)) +pyrootdir = os.path.dirname(filedir) +basedir = os.path.dirname(pyrootdir) +sys.path.append(pyrootdir) +sys.path.append(basedir) + +import util.tools.plotClasses as plotClasses +import util.variableHistoInterface as vhi +import ROOT +from array import array +from copy import deepcopy + + +memexp = "" + + + +def plots_ge4j_ge4t_classifier(data = None): + label = "\geq 4 jets, \geq 4 b-tags" + interfaces = [] + selection = "(N_Jets>=4&&N_BTagsM>=4)&&(1.)" + + plots = [ + plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_CSV_avg","Evt_CSV_avg",50,0.15,1.0),"Evt_CSV_avg",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_CSV_avg_tagged","Evt_CSV_avg_tagged",50,0.3,1.0),"Evt_CSV_avg_tagged",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_Deta_JetsAverage","Evt_Deta_JetsAverage",50,0.0,3.0),"Evt_Deta_JetsAverage",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_Deta_TaggedJetsAverage","Evt_Deta_TaggedJetsAverage",50,0.0,3.0),"Evt_Deta_TaggedJetsAverage",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_M_minDrLepTag","Evt_M_minDrLepTag",50,15.0,400.0),"Evt_M_minDrLepTag",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_Pt_JetsAverage","Evt_Pt_JetsAverage",50,30.0,500.0),"Evt_Pt_JetsAverage",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Evt_Pt_minDrTaggedJets","Evt_Pt_minDrTaggedJets",50,20.0,600.0),"Evt_Pt_minDrTaggedJets",selection,label), + plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_N_Jets","N_Jets",7,3.5,10.5),"N_Jets",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_JABDT_tHW_Jet_CSV_btop","Reco_JABDT_tHW_Jet_CSV_btop",50,-1.5,1.0),"Reco_JABDT_tHW_Jet_CSV_btop",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_JABDT_tHq_Jet_CSV_hdau1","Reco_JABDT_tHq_Jet_CSV_hdau1",50,-1.5,1.0),"Reco_JABDT_tHq_Jet_CSV_hdau1",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_JABDT_tHq_abs_ljet_eta","Reco_JABDT_tHq_abs_ljet_eta",50,-1.5,4.5),"Reco_JABDT_tHq_abs_ljet_eta",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_JABDT_ttbar_Jet_CSV_whaddau2","Reco_JABDT_ttbar_Jet_CSV_whaddau2",50,0.0,1.0),"Reco_JABDT_ttbar_Jet_CSV_whaddau2",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_tHq_bestJABDToutput","Reco_tHq_bestJABDToutput",50,-1.0,0.7),"Reco_tHq_bestJABDToutput",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_Reco_ttbar_toplep_m","Reco_ttbar_toplep_m",50,80.0,500.0),"Reco_ttbar_toplep_m",selection,label), + # plotClasses.Plot(ROOT.TH1D("ljets_ge4j_ge4t_classifier_memDBp","MEM",50,0.0,1.0),memexp,selection,label), + ] + + + if data: + add_data_plots(plots=plots,data=data) + return plots + + +def getDiscriminatorPlots(data = None, discrname = ''): + discriminatorPlots = [] + discriminatorPlots += plots_ge4j_ge4t_classifier(data) + + return discriminatorPlots + + +def init_plots(interfaces, data = None): + plots = [] #init list of plotClasses objects to return + dictionary = {} + for interf in interfaces: + + # check if initialization uses bin edges or min/max vals + # if 'subdict' contains the keyword 'bin_edges', an array + # of type float is created from the corresponding python list. + # Else, the min/maxvals are used + if not interf.bin_edges is None: + bins = array("f", interf.bin_edges) + nbins = len(bins)-1 # last bin edge in array is overflow bin => subtract for nbins + interf.nhistobins = nbins # update number of bins + plots.append( + plotClasses.Plot( + ROOT.TH1F(interf.histoname,interf.histotitle,nbins,bins), + interf.varname,interf.selection,interf.category_label)) + + elif not (interf.minxval is None or interf.maxxval is None): + nbins = interf.nhistobins + xmax = interf.maxxval + xmin = interf.minxval + plots.append( + plotClasses.Plot( + ROOT.TH1F(interf.histoname,interf.histotitle,nbins,xmin, xmax), + interf.varname,interf.selection,interf.category_label)) + else: + print("FATAL ERROR: Unable to load bin edges or min/max values for histogram!") + print(interf) + raise ValueError + dictionary[interf.label] = interf.getDictionary() + + if not data is None: + data.categories.update(dictionary) + + return plots + +def add_data_plots(plots, data): + plotnames = [] + for plot in plots: + plotnames.append(plot.name) + data.datavariables.extend(plotnames) + \ No newline at end of file diff --git a/plottingscripts/run3_studies.py b/plottingscripts/run3_studies.py new file mode 100644 index 00000000..f9fe3614 --- /dev/null +++ b/plottingscripts/run3_studies.py @@ -0,0 +1,363 @@ +#!/usr/bin/python2 +import sys +import os +import imp +import inspect +import optparse +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + +filedir = os.path.dirname(os.path.realpath(__file__)) +pyrootdir = "/".join(filedir.split("/")[:-1]) + +sys.path.append(pyrootdir) + +# local imports +import util.analysisClass as analysisClass +import util.configClass as configClass +import util.monitorTools as monitorTools +import util.plotParallel as plotParallel +import util.makePlots as makePlots +import util.haddParallel as haddParallel +import util.checkHistos as checkHistos +import util.makeDatacards as makeDatacards + +def main(pyrootdir, opts): + print ''' + # ======================================================== + # welcome to main function - defining some variables + # ======================================================== + ''' + # name of the analysis (i.e. workdir name) + name = 'run3_studies/first_steps' + + # path to workdir subfolder where all information should be saved + workdir = pyrootdir + "/workdir/" + name + + # signal process + signalProcess = "ttH" + nSigSamples = 1 + + # dataera + dataera = "2018" + + # Name of final discriminator, should not contain underscore + discrName = '' + nom_histname_template = "$CHANNEL__$PROCESS" + syst_histname_template = nom_histname_template + "__$SYSTEMATIC" + histname_separator = "__" + + # define MEM discriminator variable + memexp = "(memDBp>=0.0)*(memDBp)+(memDBp<0.0)*(0.01)+(memDBp==1.0)*(0.01)" + # configs + config = "legacyAnalysis/samples_2018" + variable_cfg = "legacyAnalysis/additionalVariables" + plot_cfg = "run3_studies/test" + # plot_cfg = "legacyAnalysis/DNN_14-05-2020/combined161718_DNNs" + # syst_cfg = "legacyAnalysis/systs_2018" + syst_cfg = "legacyAnalysis/no_systs" + replace_cfg = "legacyAnalysis/pdf_relic_names" + + sfCorrection = {} + sfCorrection["sfFile"] = pyrootdir+"/data/btagSFCorrection/sf_2018_deepJet_combined.root" + # variables for the correction + sfCorrection["corrections"] = {} + sfCorrection["corrections"]["HT_vs_NJet"] = ["Evt_HT_jets", "N_Jets"] + # in root file sf histograms exist with some naming scheme + sfCorrection["nameTemplate"] = "$BINNING__$PROCESS__$NAME" + # SF_ is always preprended by default, that should not be changed + # $BINNING = "_vs_".join(corrections[X]) + # DANGER: order of variables is important + # name of corrections to be applied (should match whats defined in syst.csv or samples.py) + sfCorrection["names"] = ["btag_NOMINAL"] + + # file for rate factors + #rateFactorsFile = pyrootdir + "/data/rate_factors_onlyinternal_powhegpythia.csv" + rateFactorsFile = pyrootdir + "/data/rateFactors/ratefactors_2018_with_correction.csv" + + # script options + analysisOptions = { + # general options + "usePseudoData": True, + "testrun": False, # test run with less samples + "stopAfterCompile": False, # stop script after compiling + # options to activate parts of the script + "haddFromWildcard": True, + "makeDataCards": False, + "makeInputDatacards": False, # create datacards also for all defined plots + "addData": True, # adding real data + "makePlots": True, + # options for makePlots + "signalScaling": -1, + "lumiLabel": True, + "cmslabel": "private Work", + "ratio": "#frac{S+B}{B}", + "shape": False, + "logarithmic": False, + "splitLegend": True, + "normalize": False, + # the skipX options try to skip the submission of files to the batch system + # before skipping the output is crosschecked + # if the output is not complete, the skipped part is done anyways + "skipPlotParallel": opts.skipPlotParallel, + "skipHaddParallel": opts.skipHaddParallel, + "skipHaddFromWildcard": opts.skipHaddFromWildcard, + "skipHistoCheck": opts.skipHistoCheck, + "skipMergeSysts": opts.skipMergeSysts, + "skipDatacards": opts.skipDatacards} + + plotJson = ""#pyrootdir+"/configs/legacyAnalysis/treeJson_2018.json" + # plotDataBases = [["memDB","/nfs/dust/cms/user/vdlinden/legacyTTH/memes/memTrees/2018/",True]] + # memDataBase = "/nfs/dust/cms/user/swieland/ttH_legacy/MEMdatabase/CodeforScriptGenerator/MEMDataBase/MEMDataBase" + # dnnInterface = {"interfacePath": pyrootdir+"/util/dNNInterfaces/MLfoyInterface.py", + # "checkpointFiles": pyrootdir+"/configs/legacyAnalysis/finalDNN/DNNInputData/"} + dnnInterface = None + + # path to datacardMaker directory + datacardmaker = "/nfs/dust/cms/user/lreuter/forPhilip/datacardMaker" + + print ''' + # ======================================================== + # initializing analysisClass + # ======================================================== + ''' + + # save a lot of useful information concerning the analysis + analysis = analysisClass.analysisConfig( + workdir = workdir, + pyrootdir = pyrootdir, + signalProcess = signalProcess, + pltcfgName = config, + discrName = discrName, + dataera = dataera) + + + analysis.initAnalysisOptions( analysisOptions ) + + pltcfg = analysis.initPlotConfig() + print "We will import the following plotconfig: ", analysis.getPlotConfig() + + # loading monitorTools module locally + monitor = monitorTools.init(analysis.workdir) + monitor.printClass(analysis, "init") + + print ''' + # ======================================================== + # prepare configdata + # ======================================================== + ''' + + configData = configClass.configData( + analysisClass = analysis, + variable_config = variable_cfg, + plot_config = plot_cfg, + execute_file = os.path.realpath(inspect.getsourcefile(lambda:0)), + replace_config = replace_cfg + ) + + configData.initSystematics(systconfig = syst_cfg) + + configData.initData() + + # get the discriminator plots + configData.genDiscriminatorPlots(memexp, dnnInterface) + configData.writeConfigDataToWorkdir() + monitor.printClass(configData, "init") + print ''' + # ======================================================== + # define additional variables necessary for selection in plotparallel + # ======================================================== + ''' + configData.getAddVariables() # also adds DNN variables + + print ''' + # ======================================================== + # loading samples and samples data + # ======================================================== + ''' + configData.initSamples() + + print ''' + # ======================================================== + # done with preprocessing + # ======================================================== + ''' + + # plot everything, except during drawParallel step + # Create file for data cards + print ''' + # ======================================================== + # starting with plotParallel step + # ======================================================== + ''' + + with monitor.Timer("plotParallel"): + # initialize plotParallel class + pP = plotParallel.plotParallel( + analysis = analysis, + configData = configData, + nominalHistKey = nom_histname_template, + systHistKey = syst_histname_template, + separator = histname_separator) + + monitor.printClass(pP, "init") + # set some changed values + pP.setJson(plotJson) + # pP.setDataBases(plotDataBases) + # pP.setMEMDataBase(memDataBase) + pP.setDNNInterface(dnnInterface) + pP.setMaxEvts_nom(50000) + # pP.setMaxEvts_nom(200000) + pP.setMaxEvts_systs(200000) + # pP.request_runtime = 60*60*5 + pP.setRateFactorsFile(rateFactorsFile) + pP.setSampleForVariableSetup(configData.samples[nSigSamples]) + pP.setSFCorrection(sfCorrection) + pP.setUseFriendTrees(True) + + # run plotParallel + pP.run() + + pP.checkTermination() + monitor.printClass(pP, "after plotParallel") + + # hadd histo files before renaming. The histograms are actually already renamed. + # But the checkbins thingy will not have been done yet. + print ''' + # ======================================================== + # hadding from wildcard + # ======================================================== + ''' + with monitor.Timer("haddFilesFromWildCard"): + haddParallel.haddSplitter( + input = pP.getHaddOutPath(), + outName = analysis.ppRootPath, + subName = "haddParts", + nHistosRemainSame = True, + skipHadd = analysis.skipHaddFromWildcard) + + + pP.setRenameInput() + if pP.configData.replace_config and not analysis.skipMergeSysts: + with monitor.Timer("mergeSystematics"): + print("merging systematics") + pP.mergeSystematics() + + # Deactivate check bins functionality in renameHistos + # if additional plot variables are added via analysis class + if os.path.exists( analysis.setRenamedPath(name = "limitInput") ): + print( "renamed file already exists - skipping renaming histos" ) + else: + print ''' + # ======================================================== + # renaming Histograms + # ======================================================== + ''' + + # in this function the variable self.renameInput is set + # if hadd files were created during plotParallel + # the renameInput is set to pP.getHaddFiles + # (a.k.a. the list of hadd files) + # if no hadd files were created during plotparallel + # the renameInput is set to pp.getOutPath + # (a.ka. the path to output.root) + + with monitor.Timer("checkHistos"): + checkHistos.checkHistsManager( + inFiles = pP.getRenameInput(), + outFile = analysis.renamedPath, + checkBins = True, + eps = 0.0, + skipHistoCheck = analysis.skipHistoCheck) + + + if analysis.addData: + print ''' + # ======================================================== + # adding data with plotParallel + # ======================================================== + ''' + with monitor.Timer("addRealData"): + if analysis.usePseudoData: + print("adding data_obs histograms as pseudo data") + # pseudo data without ttbb 5FS + # pP.addData( samples = configData.samples[:-1], + # discrName = discrName) + pP.addData( samples = configData.samples[:-9], + discrName = discrName) + #pP.addData(samples = configData.samples) + else: + print("adding data_obs histograms as real data") + # real data with ttH + pP.addData( samples = configData.controlSamples, + discrName = discrName) + + + + pP.checkTermination() + monitor.printClass(pP, "after plotParallel completely done") + + print("########## DONE WITH PLOTPARALLEL STEP ##########") + print("at the moment the outputpath is "+str(analysis.renamedPath)) + print("#################################################") + + if analysis.makeDataCards or analysis.makeInputDatacards and not opts.skipDatacards: + print ''' + # ======================================================== + # Making Datacards. + # ======================================================== + ''' + with monitor.Timer("makeDatacardsParallel"): + makeDatacards.makeDatacardsParallel( + filePath = analysis.renamedPath, + workdir = analysis.workdir, + categories = configData.getDatacardLabels(analysis.makeInputDatacards), + doHdecay = True, + discrname = analysis.discrName, + datacardmaker = datacardmaker, + signalTag = analysis.signalProcess, + skipDatacards = analysis.skipDatacards, + nominal_key = nom_histname_template.replace("__$CHANNEL","__finaldiscr_$CHANNEL"), + syst_key = syst_histname_template.replace("__$CHANNEL","__finaldiscr_$CHANNEL") + ) + + if analysis.makePlots: + print ''' + # ======================================================== + # Making Plots + # ======================================================== + ''' + with monitor.Timer("makePlots"): + makePlots.makePlots(configData = configData, + nominal_key = nom_histname_template, + syst_key = syst_histname_template) + + + print ''' + # ======================================================== + # this is the end of the script + # ======================================================== + ''' + +if __name__ == "__main__": + parser = optparse.OptionParser() + parser.add_option("--skipPlotParallel", dest = "skipPlotParallel", action = "store_true", default = False) + parser.add_option("--skipHaddParallel", dest = "skipHaddParallel", action = "store_true", default = False) + parser.add_option("--skipHaddFromWildcard", dest = "skipHaddFromWildcard", action = "store_true", default = False) + parser.add_option("--skipHistoCheck", dest = "skipHistoCheck", action = "store_true", default = False) + parser.add_option("--skipMergeSysts", dest = "skipMergeSysts", action = "store_true", default = False) + parser.add_option("--skipDatacards", dest = "skipDatacards", action = "store_true", default = False) + parser.add_option("--skip", dest = "skip", default = 0, type = "int", + help = "skip first INT parallel stages. plotParallel (1), haddParallel (2), haddFromWildcard (3), histoCheck (4), mergeSysts (5), Datacards (6)") + + (opts, args) = parser.parse_args() + + if opts.skip >= 1: opts.skipPlotParallel = True + if opts.skip >= 2: opts.skipHaddParallel = True + if opts.skip >= 3: opts.skipHaddFromWildcard = True + if opts.skip >= 4: opts.skipHistoCheck = True + if opts.skip >= 5: opts.skipMergeSysts = True + if opts.skip >= 6: opts.skipDatacards = True + + + main(pyrootdir, opts) diff --git a/util/configClass.py b/util/configClass.py index a9e0c55b..c1ffd4bb 100644 --- a/util/configClass.py +++ b/util/configClass.py @@ -1,8 +1,9 @@ import ROOT import os import sys -import importlib +# import importlib import pprint +import imp # local imports filedir = os.path.dirname(os.path.realpath(__file__)) @@ -83,8 +84,11 @@ def genDiscriminatorPlots(self, memexp, dnnInterface = None): if not dnnInterface: sys.exit("cannot load plots because no dnnInterface specified and no plotconfig") # loading dnn interface path = dnnInterface["interfacePath"] - sys.path.append(os.path.dirname(path)) - dnnModule = importlib.import_module( path.split("/")[-1].replace(".py","") ) + # sys.path.append(os.path.dirname(path)) + if not path.endswith(".py"): + path = path+".py" + # dnnModule = importlib.import_module( path.split("/")[-1].replace(".py","") ) + dnnModule = imp.load_source("dnnModule", path) interface = dnnModule.theInterface( dnnSet = dnnInterface["checkpointFiles"], crossEvaluation = self.analysis.crossEvaluation) @@ -101,9 +105,13 @@ def genDiscriminatorPlots(self, memexp, dnnInterface = None): self.plot_config = "plotconfig_local" sys.path.append(self.analysis.workdir) - fileName = self.cfgdir+"/"+self.plot_config - sys.path.append(os.path.dirname(fileName)) - configdatafile = importlib.import_module( os.path.basename(fileName) ) + fileName = os.path.join(self.cfgdir, self.plot_config) + # sys.path.append(os.path.dirname(fileName)) + # configdatafile = importlib.import_module( os.path.basename(fileName) ) + if not fileName.endswith(".py"): + fileName = fileName + ".py" + print("loading plot_config from '{}'".format(fileName)) + configdatafile = imp.load_source( "configdatafile", fileName) configdatafile.memexp = memexp self.discriminatorPlots = configdatafile.getDiscriminatorPlots(self.Data, self.analysis.discrName) @@ -129,10 +137,13 @@ def getDatacardLabels(self, doVariables = False, discrName = None): return bin_labels + self.getVariablelabels() def getAddVariables(self): - fileName = self.cfgdir+"/"+self.variable_config - sys.path.append(os.path.dirname(fileName)) + fileName = os.path.join(self.cfgdir, self.variable_config) + if not fileName.endswith(".py"): + fileName = fileName+".py" + # sys.path.append(os.path.dirname(fileName)) print("getting additional variables from "+str(fileName)) - addVarModule = importlib.import_module( os.path.basename(fileName) ) + # addVarModule = importlib.import_module( os.path.basename(fileName) ) + addVarModule = imp.load_source( "addVarModule", fileName ) self.addVars = addVarModule.getAddVars() def getSystSamples(self): diff --git a/util/plotParallel.py b/util/plotParallel.py index b9215842..f5400b0a 100644 --- a/util/plotParallel.py +++ b/util/plotParallel.py @@ -128,14 +128,15 @@ def setAddInterfaces(self, interfaces): print( "unknown additional code interface type: " + str(interface) ) def setDNNInterface(self, interfaceConfig): - interfacePath = interfaceConfig["interfacePath"] - checkpointFiles = interfaceConfig["checkpointFiles"] - addModule = "addModule"+str(len(self.addInterfaces)+1) - print("loading module "+str(interfacePath)+" as "+addModule+" module.") - self.addInterfaces.append( - imp.load_source(addModule, interfacePath).theInterface( - self.analysis.workdir, checkpointFiles, - crossEvaluation = self.analysis.crossEvaluation)) + if interfaceConfig: + interfacePath = interfaceConfig["interfacePath"] + checkpointFiles = interfaceConfig["checkpointFiles"] + addModule = "addModule"+str(len(self.addInterfaces)+1) + print("loading module "+str(interfacePath)+" as "+addModule+" module.") + self.addInterfaces.append( + imp.load_source(addModule, interfacePath).theInterface( + self.analysis.workdir, checkpointFiles, + crossEvaluation = self.analysis.crossEvaluation)) def setRateFactorsFile(self, csvfile): self.rateFactorsFile = csvfile