...
In the following, the most important excerpts are described.
Annotated Description
...
...
[4]
...
Load data using PANDAS data frames
Now you can start using your data and load three different NumPy arrays! One corresponds to the VBF signal and the other two will represent the Higgs boson production via the strong interaction processes (in jargon, QCD) and that will be used as a merged background.
Moreover, you will look at the physical observables that you can use to train the ML algorithms.
In [ ]:
# Import libraries: data handling
import uproot
import numpy as np
import pandas as pd
import h5py
import seaborn as sns
# scikit-learn utilities for shuffling and splitting the datasets
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
# TensorFlow / Keras for building and training the ML models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import SGD, Adam, RMSprop, Adagrad, Adadelta
from tensorflow.keras.layers import Input, Activation, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import utils
from tensorflow import random as tf_random
# Use the tf.keras implementation consistently: the original imported
# plot_model from the standalone `keras` package, which can clash with
# the tensorflow.keras objects used everywhere else in this notebook.
from tensorflow.keras.utils import plot_model
import random as python_random
# Fix the random seeds so the whole notebook is reproducible.
seed = 7
# Start NumPy-generated random numbers from a well-defined initial state.
np.random.seed(seed)
# Start core-Python random numbers from a well-defined state.
python_random.seed(seed)
# Make random number generation in the TensorFlow backend start from a
# well-defined initial state as well. For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf_random.set_seed(seed)
...
# Name of the ROOT tree holding the reduced H->ZZ->4-lepton analysis variables
treename = 'HZZ4LeptonsAnalysisReduced'

# Per-sample bookkeeping, all keyed by a short sample label:
# input file names, open file handles, parameters, resulting DataFrames.
filename = {}
upfile = {}
params = {}
df = {}

# ROOT files we are interested in: one signal category (VBF production)
# and two background categories.
filename['sig'] = 'VBF_HToZZTo4mu.root'
filename['bkg_ggHtoZZto4mu'] = 'GluGluHToZZTo4mu.root'
filename['bkg_ZZto4mu'] = 'ZZTo4mu.root'
# Alternative datasets for the second part of the exercise:
#filename['bkg_ttH_HToZZ_4mu'] = 'ttH_HToZZ_4mu.root'
#filename['sig'] = 'VBF_HToZZTo4e.root'
#filename['bkg_ggHtoZZto4e'] = 'GluGluHToZZTo4e.root'
#filename['bkg_ZZto4e'] = 'ZZTo4e.root'
...
# Branches to copy from the ROOT tree into the pandas DataFrames.
# First the bookkeeping columns (run/event) and the MC event weight,
# then the high-level (event) features, then the low-level per-object
# kinematics (pt, eta, phi) of the four leptons and the two jets.
VARS = [
    'f_run', 'f_event', 'f_weight',
    'f_massjj', 'f_deltajj', 'f_mass4l', 'f_Z1mass', 'f_Z2mass',
    'f_lept1_pt', 'f_lept1_eta', 'f_lept1_phi',
    'f_lept2_pt', 'f_lept2_eta', 'f_lept2_phi',
    'f_lept3_pt', 'f_lept3_eta', 'f_lept3_phi',
    'f_lept4_pt', 'f_lept4_eta', 'f_lept4_phi',
    'f_jet1_pt', 'f_jet1_eta', 'f_jet1_phi',
    'f_jet2_pt', 'f_jet2_eta', 'f_jet2_phi',
]
# Dimension of the DataFrames: 26 variables
NDIM = len(VARS)
...
# Report how many branches will be imported into the DataFrames
print("Number of kinematic variables imported from the ROOT files = %d" % NDIM)

# Open each ROOT file with uproot, using the same keys as `filename`
upfile['sig'] = uproot.open(filename['sig'])
upfile['bkg_ggHtoZZto4mu'] = uproot.open(filename['bkg_ggHtoZZto4mu'])
upfile['bkg_ZZto4mu'] = uproot.open(filename['bkg_ZZto4mu'])
# Alternative datasets for the second part of the exercise:
#upfile['bkg_ttH_HToZZ_4mu'] = uproot.open(filename['bkg_ttH_HToZZ_4mu'])
#upfile['sig'] = uproot.open(filename['sig'])
#upfile['bkg_ggHtoZZto4e'] = uproot.open(filename['bkg_ggHtoZZto4e'])
#upfile['bkg_ZZto4e'] = uproot.open(filename['bkg_ZZto4e'])
Number of kinematic variables imported from the ROOT files = 26
Let's see what you have uploaded in your Colab notebook!
...
In [ ]:
# Signal events before applying any physical requirement:
# copy the requested branches from the ROOT tree into a pandas DataFrame
sig_arrays = upfile['sig'][treename].arrays(VARS, library="np")
df['sig'] = pd.DataFrame(sig_arrays, columns=VARS)
# (rows, columns) == (number of events, number of variables)
print(df['sig'].shape)
...
...
(24867, 26)
Comment: We have 24867 rows, i.e. 24867 different events, and 26 columns (whose meaning will be explained later).
Let's print out the first rows of this data set!
In [ ]:
# Peek at the first five signal events (head's default length, made explicit)
df['sig'].head(n=5)
Out[ ]:
f_run | f_event | f_weight | f_massjj | f_deltajj | f_mass4l | f_Z1mass | f_Z2mass | f_lept1_pt | f_lept1_eta | f_lept1_phi | f_lept2_pt | f_lept2_eta | f_lept2_phi | f_lept3_pt | f_lept3_eta | f_lept3_phi | f_lept4_pt | f_lept4_eta | f_lept4_phi | f_jet1_pt | f_jet1_eta | f_jet1_phi | f_jet2_pt | f_jet2_eta | f_jet2_phi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 385228 | 0.000176 | 667.271423 | 3.739947 | 124.966576 | 90.768616 | 20.508274 | 82.890457 | 0.822203 | 1.343706 | 65.486946 | 0.382922 | 2.568485 | 39.838531 | 0.546917 | 2.497204 | 28.562206 | 0.174666 | 2.013540 | 116.326035 | -1.126533 | -1.759238 | 90.333893 | 2.613415 | -0.096671 |
1 | 1 | 385233 | 0.000127 | 129.085892 | 0.046317 | 120.231926 | 80.782318 | 34.261726 | 41.195362 | -0.534245 | 2.802684 | 24.911942 | -2.065928 | 0.371150 | 21.959597 | -1.219900 | -2.938914 | 16.676077 | -0.162915 | 1.783374 | 105.491882 | 3.253374 | -1.297283 | 38.978493 | 3.207056 | 1.553476 |
2 | 1 | 385254 | 0.000037 | 285.165222 | 3.166899 | 125.254646 | 91.392693 | 25.695290 | 80.788002 | 0.943778 | 0.729632 | 35.549721 | 0.935241 | 1.288549 | 23.206284 | 0.236346 | -2.670540 | 14.581854 | 1.516623 | 0.284658 | 69.315170 | 2.573589 | -2.030811 | 51.972664 | -0.593310 | -2.799394 |
3 | 1 | 385260 | 0.000043 | 52.006794 | 0.150803 | 125.067009 | 91.183708 | 19.631315 | 129.883423 | 0.235406 | -1.729384 | 37.950790 | 1.226075 | -2.540356 | 17.678413 | 0.096546 | -1.533120 | 8.197763 | -0.157577 | 0.339215 | 202.689468 | 2.530802 | 1.325786 | 41.343758 | 2.681605 | 0.858582 |
4 | 1 | 385263 | 0.000092 | 1044.083496 | 4.315164 | 124.305748 | 72.480515 | 43.826504 | 86.220734 | -0.226653 | 0.117277 | 80.451378 | -0.536749 | 0.385678 | 27.497240 | 0.827591 | -0.072236 | 21.243813 | -0.579560 | -0.884727 | 127.192223 | -2.362456 | -2.945257 | 115.200272 | 1.952708 | 2.053301 |
The first 2 columns contain information which are provided by experiments at the LHC that will not be used in the training of our Machine Learning algorithms, therefore we skip our explanation to the next columns.
The next variable is the
f_weights
. This corresponds to the probability of having that particular kind of physical process on the whole experiment. Indeed, it is a product of the Branching Ratio (BR), the geometrical acceptance of the detector, and the kinematic phase-space. It is very important for the training phase and you will use it later. The variables
f_massjj
,f_deltajj
,f_mass4l
,f_Z1mass
, andf_Z2mass
are named high-level features (event features) since they contain overall information about the final-state particles (the mass of the two jets, their separation in space, the invariant mass of the four leptons, the masses of the two Z bosons). Note that the Z2 mass is lighter w.r.t. the Z1 one. Why is that? In the Higgs boson production (hypothesis of mass = 125 GeV) only one of the Z bosons is an actual particle that has the nominal mass of 91.18 GeV. The other one is a virtual (off-mass shell) particle.The remnant columns represent the low-level features (object kinematics observables), the basic measurements which are made by the detectors for the individual final state objects (in our case four charged leptons and jets) such as
f_lept1(2,3,4)_pt(phi,eta)
corresponding to their transverse momentum and the spatial distribution of their tracks (pseudorapidity η and azimuthal angle φ).
The same comments hold for the background datasets:
# Background events before applying any physical requirement.
# Part of the code in "#" can be used in the second part of the exercise
# for trying alternative datasets for the training of our ML algorithms.
#df['bkg'] = pd.DataFrame(upfile['bkg'][treename].arrays(VARS, library="np"), columns=VARS)
#df['bkg'].head()
ggh_arrays = upfile['bkg_ggHtoZZto4mu'][treename].arrays(VARS, library="np")
df['bkg_ggHtoZZto4mu'] = pd.DataFrame(ggh_arrays, columns=VARS)
df['bkg_ggHtoZZto4mu'].head()
#df['bkg_ggHtoZZto4e'] = pd.DataFrame(upfile['bkg_ggHtoZZto4e'][treename].arrays(VARS, library="np"), columns=VARS)
#df['bkg_ggHtoZZto4e'].head()
#df['bkg_ZZto4e'] = pd.DataFrame(upfile['bkg_ZZto4e'][treename].arrays(VARS, library="np"), columns=VARS)
#df['bkg_ZZto4e'].head()
Out[ ]:
f_run | f_event | f_weight | f_massjj | f_deltajj | f_mass4l | f_Z1mass | f_Z2mass | f_lept1_pt | f_lept1_eta | f_lept1_phi | f_lept2_pt | f_lept2_eta | f_lept2_phi | f_lept3_pt | f_lept3_eta | f_lept3_phi | f_lept4_pt | f_lept4_eta | f_lept4_phi | f_jet1_pt | f_jet1_eta | f_jet1_phi | f_jet2_pt | f_jet2_eta | f_jet2_phi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 581632 | 0.000225 | -999.0 | -999.0 | 120.101105 | 88.262352 | 22.051540 | 57.572330 | -0.433627 | -0.886073 | 56.933735 | 0.496556 | 0.404675 | 33.584896 | -0.037387 | 0.291866 | 10.881461 | -1.112960 | 0.051097 | 73.541260 | 1.683280 | 2.736636 | -999.0 | -999.0 | -999.0 |
1 | 1 | 581659 | 0.000277 | -999.0 | -999.0 | 124.592812 | 82.174683 | 17.613417 | 50.365120 | 0.001362 | 0.933713 | 31.548225 | 0.598417 | -1.863556 | 22.758055 | 0.220867 | -2.767246 | 17.264626 | 0.361964 | -1.859138 | -999.000000 | -999.000000 | -999.000000 | -999.0 | -999.0 | -999.0 |
2 | 1 | 581671 | 0.000278 | -999.0 | -999.0 | 125.692230 | 79.915764 | 29.998011 | 72.355927 | -0.238323 | -2.335623 | 20.644920 | -0.241560 | 1.855536 | 16.031651 | -1.446993 | 1.185016 | 11.068296 | 0.366903 | -0.606845 | 64.440544 | 1.886244 | 1.635723 | -999.0 | -999.0 | -999.0 |
3 | 1 | 581724 | 0.000336 | -999.0 | -999.0 | 125.027504 | 85.200958 | 23.440151 | 43.059235 | 0.759979 | -1.714778 | 19.248983 | 0.535979 | 0.420337 | 16.595169 | -1.330326 | 1.656061 | 11.407483 | -0.686118 | 1.295116 | -999.000000 | -999.000000 | -999.000000 | -999.0 | -999.0 | -999.0 |
4 | 1 | 581744 | 0.000273 | -999.0 | -999.0 | 124.917282 | 65.971390 | 14.968305 | 52.585011 | -0.656421 | -2.933651 | 35.095982 | -1.002568 | 0.865173 | 28.146715 | -0.730926 | -0.876442 | 8.034222 | -1.094436 | 1.783626 | -999.000000 |
# Second background process (ZZ -> 4mu), copied into its own DataFrame
zz_arrays = upfile['bkg_ZZto4mu'][treename].arrays(VARS, library="np")
df['bkg_ZZto4mu'] = pd.DataFrame(zz_arrays, columns=VARS)
df['bkg_ZZto4mu'].head()
f_run | f_event | f_weight | f_massjj | f_deltajj | f_mass4l | f_Z1mass | f_Z2mass | f_lept1_pt | f_lept1_eta | f_lept1_phi | f_lept2_pt | f_lept2_eta | f_lept2_phi | f_lept3_pt | f_lept3_eta | f_lept3_phi | f_lept4_pt | f_lept4_eta | f_lept4_phi | f_jet1_pt | f_jet1_eta | f_jet1_phi | f_jet2_pt | f_jet2_eta | f_jet2_phi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1991117 | 0.001420 | 384.394165 | 0.235409 | 309.921478 | 93.538399 | 87.436043 | 84.918190 | -0.073681 | -1.339234 | 60.143539 | -1.229701 | -1.409149 | 54.892681 | 1.125339 | -1.046433 | 42.139397 | 0.966109 | 2.593184 | 240.828506 | 0.103300 | 2.408482 | 195.838226 | 0.338708 | 0.285348 |
1 | 1 | 1991192 | 0.000893 | 110.589844 | 0.956070 | 326.481903 | 92.948936 | 85.379288 | 124.270218 | 1.388811 | -1.738097 | 87.379723 | 0.766540 | 2.502843 | 54.472603 | -0.106614 | 1.626933 | 18.505959 | 2.012172 | 2.229677 | 77.210411 | 2.061765 | -0.532572 | 48.432365 | 1.105695 | 1.128457 |
2 | 1 | 1991331 | 0.000839 | -999.000000 | -999.000000 | 91.167046 | 56.161217 | 14.535084 | 25.241573 | 1.410529 | 2.080089 | 21.971258 | 1.465800 | 1.868505 | 20.648312 | 0.787121 | 2.017863 | 9.831321 | -1.329539 | 2.286600 | 66.642792 | 1.176917 | -1.089489 | -999.000000 | -999.000000 | -999.000000 |
3 | 1 | 1991364 | 0.000906 | -999.000000 | -999.000000 | 323.428345 | 88.717270 | 94.940346 | 65.728729 | -0.561113 | 2.596448 | 50.528595 | 2.227971 | 0.101310 | 39.392380 | 0.294608 | -1.756674 | 33.169487 | 0.367907 | -0.241346 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 |
4 | 1 | 1991360 | 0.001034 | -999.000000 | -999.000000 | 274.207916 | 90.799271 | 90.156898 | 101.931305 | 0.828778 | 2.440133 | 89.171135 | -0.052834 |
# Merge the two background processes into one dataset...
merged_bkg = pd.concat([df['bkg_ZZto4mu'], df['bkg_ggHtoZZto4mu']])
# ...shuffle the merged events...
df['bkg'] = shuffle(merged_bkg)
# ...and look at the resulting shape (events, variables)
print(df['bkg'].shape)
(952342, 26)
Note that the background datasets seem to have a very large number of events! Is that true? Do all physical variables have meaningful values? Let's make physical selection requirements!
# Remove events with undefined variable entries: any column stored as the
# sentinel value -999 marks a quantity that could not be reconstructed
# (e.g. the jet variables in events with fewer than two jets).
# A single vectorized mask over all VARS replaces the original
# per-column loop (`for i in range(NDIM)`), which re-sliced each
# DataFrame 26 times; the surviving rows are identical.
df['sig'] = df['sig'][(df['sig'][VARS] > -999).all(axis=1)]
df['bkg'] = df['bkg'][(df['bkg'][VARS] > -999).all(axis=1)]
# Add the column isSignal to the dataframe containing the truth information,
# i.e. it tells if that particular event is signal (isSignal=1)
# or background (isSignal=0)
df['sig']['isSignal'] = np.ones(len(df['sig']))
df['bkg']['isSignal'] = np.zeros(len(df['bkg']))
print("Number of Signal events = %d " %len(df['sig']['isSignal']))
print("Number of Background events = %d " %len(df['bkg']['isSignal']))
Number of Signal events = 14260
Number of Background events = 100724