Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

An alternative way to check for overfitting, and to choose an appropriate threshold for selecting signal events, is to plot the signal and background ANN predictions for the training and test datasets. If the distributions are quite similar, it means that the algorithm has learned how to generalize!
To measure the overfitting quantitatively, one can perform a Kolmogorov-Smirnov test, which we will not implement here.

# Build the signal/background network inputs for the full samples and for the
# test/training partitions, then evaluate the model on each partition.

# Full signal and background samples, restricted to the NN input variables
df_sig = df['sig'].filter(NN_VARS)
X_sig = df_sig.values.astype(np.float32)
df_bkg = df['bkg'].filter(NN_VARS)
X_bkg = df_bkg.values.astype(np.float32)

# Positional split of df_all: the leading rows form the test partition,
# the rows after them (up to `size`) form the training partition
df_test = df_all.iloc[:test_size + 1]
df_train = df_all.iloc[test_size + 1:size]

# Test partition: rows with isSignal >= 1 are signal, rows with isSignal < 1 are background
df_test_sig = df_test[df_test['isSignal'] >= 1].filter(NN_VARS)
df_test_bkg = df_test[df_test['isSignal'] < 1].filter(NN_VARS)
X_test_sig = df_test_sig.values.astype(np.float32)
X_test_bkg = df_test_bkg.values.astype(np.float32)

# Training partition: same signal/background selection
df_train_sig = df_train[df_train['isSignal'] >= 1].filter(NN_VARS)
df_train_bkg = df_train[df_train['isSignal'] < 1].filter(NN_VARS)
X_train_sig = df_train_sig.values.astype(np.float32)
X_train_bkg = df_train_bkg.values.astype(np.float32)

# Report the size of every subset (label on one line, shape on the next)
print('Test dataset shape:', df_test.shape, sep='\n')
print('Test dataset signal shape:', df_test_sig.shape, sep='\n')
print('Test dataset background shape:', df_test_bkg.shape, sep='\n')
print('Training dataset shape', df_train.shape, sep='\n')
print('Training signal dataset shape', df_train_sig.shape, sep='\n')
print('Training background dataset shape', df_train_bkg.shape, sep='\n')

# Model predictions for each of the four subsets
Y_test_sig = model.predict(X_test_sig)    # test signal events
Y_test_bkg = model.predict(X_test_bkg)    # test background events
Y_train_sig = model.predict(X_train_sig)  # training signal events
Y_train_bkg = model.predict(X_train_bkg)  # training background events
Test dataset shape:
(22997, 27)
Test dataset signal shape:
(2870, 5)
Test dataset background shape:
(20127, 5)
Training dataset shape
(91987, 27)
Training signal dataset shape
(11390, 5)
Training background dataset shape
(80597, 5)
# Display the first rows of the test partition
df_test.head()
f_runf_eventf_weightf_massjjf_deltajjf_mass4lf_Z1massf_Z2massf_lept1_ptf_lept1_etaf_lept1_phif_lept2_ptf_lept2_etaf_lept2_phif_lept3_ptf_lept3_etaf_lept3_phif_lept4_ptf_lept4_etaf_lept4_phif_jet1_ptf_jet1_etaf_jet1_phif_jet2_ptf_jet2_etaf_jet2_phiisSignal
91011809130.000075499.4156803.541091123.75025269.38652822.19623247.066288-1.938778-0.15717824.794939-1.4770992.68075521.430199-1.085863-0.47456316.9239370.011259-0.30433883.238281-2.0226971.94562984.3143461.518393-2.2817621.0
61307117994700.0000041034.7006845.445127123.12625187.02504030.89939154.3023341.2546650.49110134.2186051.576207-2.81967924.9332311.1952932.53072011.1068660.8362140.24099288.764404-1.835088-2.80926952.3511093.610039-2.0638010.0
4340651486363300.000015131.1002201.032331224.59153790.623093115.57325764.9857481.0223290.02078749.217106-0.7681050.15217143.2802050.537557-0.21153041.790005-0.1872012.840212114.5000840.6141503.13047532.3970491.646480-0.9251760.0
7559351543794980.00000483.6580731.574079201.77981695.84697085.43880572.0736160.108228-2.73020554.2195930.489068-1.29926924.958881-1.389947-2.60421813.0225901.9194281.66405046.5155451.1339490.13981544.397335-0.4401290.5332510.0
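5041791984935690.000001652.3598633.799881335.02398790.21605792.984535126.7480391.168150-0.71131387.271675-0.7072921.16773241.464527-0.289785-0.50948114.898630-0.4704650.31710799.428864-3.4758052.92807799.2104490.324076-3.1020450.0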
# Display the first rows of df_all, the dataframe that df_test and df_train are sliced from
df_all.head()


f_runf_eventf_weightf_massjjf_deltajjf_mass4lf_Z1massf_Z2massf_lept1_ptf_lept1_etaf_lept1_phif_lept2_ptf_lept2_etaf_lept2_phif_lept3_ptf_lept3_etaf_lept3_phif_lept4_ptf_lept4_etaf_lept4_phif_jet1_ptf_jet1_etaf_jet1_phif_jet2_ptf_jet2_etaf_jet2_phiisSignal
91011809130.000075499.4156803.541091123.75025269.38652822.19623247.066288-1.938778-0.15717824.794939-1.4770992.68075521.430199-1.085863-0.47456316.9239370.011259-0.30433883.238281-2.0226971.94562984.3143461.518393-2.2817621.0
61307117994700.0000041034.7006845.445127123.12625187.02504030.89939154.3023341.2546650.49110134.2186051.576207-2.81967924.9332311.1952932.53072011.1068660.8362140.24099288.764404-1.835088-2.80926952.3511093.610039-2.0638010.0
4340651486363300.000015131.1002201.032331224.59153790.623093115.57325764.9857481.0223290.02078749.217106-0.7681050.15217143.2802050.537557-0.21153041.790005-0.1872012.840212114.5000840.6141503.13047532.3970491.646480-0.9251760.0
7559351543794980.00000483.6580731.574079201.77981695.84697085.43880572.0736160.108228-2.73020554.2195930.489068-1.29926924.958881-1.389947-2.60421813.0225901.9194281.66405046.5155451.1339490.13981544.397335-0.4401290.5332510.0
5041791984935690.000001652.3598633.799881335.02398790.21605792.984535126.7480391.168150-0.71131387.271675-0.7072921.16773241.464527-0.289785-0.50948114.898630-0.4704650.31710799.428864-3.4758052.92807799.2104490.324076-3.1020450.0
# Normalized ANN-score distributions, drawn separately for the signal and
# background predictions of the test and training samples.
plt.rcParams['figure.figsize'] = (10, 5)
X = np.linspace(0.0, 1.0, num=100)  # 100 evenly spaced bin edges spanning [0, 1]

# One step histogram per subset, each normalized to unit area, log-scaled y axis
hist_test_sig = plt.hist(
    Y_test_sig, bins=X, density=True, log=True,
    histtype='step', label='test_sig',
)
hist_test_bkg = plt.hist(
    Y_test_bkg, bins=X, density=True, log=True,
    histtype='step', label='test_bkg',
)
hist_train_sig = plt.hist(
    Y_train_sig, bins=X, density=True, log=True,
    histtype='step', label='train_sig',
)
hist_train_bkg = plt.hist(
    Y_train_bkg, bins=X, density=True, log=True,
    histtype='step', label='train_bkg',
)

# Axis labels, legend and title
plt.xlabel('ANN score')
plt.ylabel('Frequency')
plt.legend(prop={'size': 8}, loc='upper right')
plt.title(
    'ANN score normalized distribution on the whole dataset',
    color='r', fontweight='bold', fontsize=12,
)
plt.show()


References

Attachments