diff --git a/data/generate_dataset.py b/data/generate_dataset.py
index e3e0e91..613b06d 100644
--- a/data/generate_dataset.py
+++ b/data/generate_dataset.py
@@ -112,23 +112,14 @@ def bboxes_from_file(times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq
         lower_freq_bound = lower_freq_bound[mask]
         upper_freq_bound = upper_freq_bound[mask]
 
-        dt_bbox = right_time_bound - left_time_bound
-        df_bbox = upper_freq_bound - lower_freq_bound
-
-        # embed()
-        # quit()
-        # left_time_bound -= dt_bbox + 0.01 * (t1 - t0)
-        # right_time_bound += dt_bbox + 0.01 * (t1 - t0)
-        # lower_freq_bound -= df_bbox + 0.01 * (f1 - f0)
-        # upper_freq_bound += df_bbox + 0.01 * (f1 - f0)
+        # dt_bbox = right_time_bound - left_time_bound
+        # df_bbox = upper_freq_bound - lower_freq_bound
 
         left_time_bound -= 0.01 * (t1 - t0)
         right_time_bound += 0.05 * (t1 - t0)
         lower_freq_bound -= 0.01 * (f1 - f0)
         upper_freq_bound += 0.05 * (f1 - f0)
 
-        # embed()
-        # quit()
         mask2 = ((left_time_bound >= t0) &
                 (right_time_bound <= t1) &
                 (lower_freq_bound >= f0) &
@@ -150,15 +141,11 @@ def bboxes_from_file(times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq
                          lower_freq_bound,
                          upper_freq_bound,
                          x0, y0, x1, y1])
-        # test_s = ['a', 'a', 'a', 'a']
         tmp_df = pd.DataFrame(
-            # index= [pic_save_str for i in range(len(left_time_bound))],
-            # index= test_s,
             data=bbox.T,
             columns=cols
         )
         bbox_df = pd.concat([bbox_df, tmp_df], ignore_index=True)
-        # bbox_df.append(tmp_df)
     return bbox_df
 
 def main(args):
@@ -200,6 +187,7 @@ def main(args):
         #     )
         plt.show()
 
+    # Hyperparameter
     min_freq = 200
     max_freq = 1500
     d_freq = 200
@@ -207,28 +195,34 @@ def main(args):
     d_time = 60*10
     time_overlap = 60*1
 
+    # init dataframe if not existent so far
+    eval_files = []
     if not os.path.exists(os.path.join('train', 'bbox_dataset.csv')):
         cols = ['image', 't0', 't1', 'f0', 'f1', 'x0', 'y0', 'x1', 'y1']
         bbox_df = pd.DataFrame(columns=cols)
 
+    # else load datafile ... and check for already processed files (eval_files)
     else:
         bbox_df = pd.read_csv(os.path.join('train', 'bbox_dataset.csv'), sep=',', index_col=0)
         cols = list(bbox_df.keys())
-        eval_files = []
         # ToDo: make sure not same file twice
         for f in pd.unique(bbox_df['image']):
             eval_files.append(f.split('__')[0])
 
+    # find folders that have fine_specs...
     folders = list(f.parent for f in Path(args.folder).rglob('fill_times.npy'))
 
-    # embed()
-    # quit()
 
     for enu, folder in enumerate(folders):
         print(f'DataSet generation from {folder} | {enu+1}/{len(folders)}')
+        # check for those folders where rises are detected
         if not (folder/'analysis'/'rise_idx.npy').exists():
             continue
 
+        # embed()
+        # quit()
+        # ToDo: check if folder in eval_files ... if so: continue
+
         freq, times, spec, EODf_v, ident_v, idx_v, times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq, fish_baseline_freq_time = (
             load_data(folder))
         f_res, t_res = freq[1] - freq[0], times[1] - times[0]
diff --git a/data/train_test_split.py b/data/train_test_split.py
new file mode 100644
index 0000000..4d1e222
--- /dev/null
+++ b/data/train_test_split.py
@@ -0,0 +1,44 @@
+import pandas as pd
+from pathlib import Path
+import numpy as np
+import os
+
+from IPython import embed
+
+def define_train_test_img_names(bbox, test_size=0.2):
+    """Return sorted (test_img, train_img) name lists split from bbox['image']."""
+    # Private RandomState(42): same shuffle as np.random.seed(42), global RNG untouched.
+    rng = np.random.RandomState(42)
+    unique_imgs = np.asarray(pd.unique(bbox['image']))
+    rng.shuffle(unique_imgs)
+    n_test = int(len(unique_imgs) * test_size)  # first test_size fraction -> test set
+    # The shuffled order decides membership; sorting only orders the returned lists.
+    return sorted(unique_imgs[:n_test]), sorted(unique_imgs[n_test:])
+
+def split_data_df_in_test_train_df(bbox, test_img, train_img):
+    """Split bbox rows into train/test frames by their 'image' column.
+
+    Rows are grouped in the order of the given image-name lists and the index
+    is renumbered from 0. Returns (train_bbox, test_bbox, cols), with cols
+    being the column names of bbox.
+    """
+    cols = list(bbox.keys())
+
+    def rows_for(img_names):
+        # One concat per split: no quadratic re-copying and no concat with an
+        # empty frame (its dtype handling is deprecated in pandas).
+        parts = [bbox[bbox['image'] == img_name] for img_name in img_names]
+        return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=cols)
+    return rows_for(train_img), rows_for(test_img), cols
+def main(path):
+    """Read path/'bbox_dataset.csv', split it by image, write bbox_train.csv and bbox_test.csv."""
+    full_df = pd.read_csv(path / 'bbox_dataset.csv', sep=',', index_col=0)
+    # Decide per image (not per row) which split it belongs to.
+    test_names, train_names = define_train_test_img_names(full_df)
+    train_df, test_df, columns = split_data_df_in_test_train_df(full_df, test_names, train_names)
+    # Write both splits next to the source CSV.
+    for split_df, csv_name in ((train_df, 'bbox_train.csv'), (test_df, 'bbox_test.csv')):
+        split_df.to_csv(path / csv_name, columns=columns, sep=',')
+
+if __name__ == '__main__':  # run the split only when executed as a script
+    main(Path('./train'))  # dataset directory is relative to the current working directory
\ No newline at end of file