diff --git a/data/generate_dataset.py b/data/generate_dataset.py index e3e0e91..613b06d 100644 --- a/data/generate_dataset.py +++ b/data/generate_dataset.py @@ -112,23 +112,14 @@ def bboxes_from_file(times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq lower_freq_bound = lower_freq_bound[mask] upper_freq_bound = upper_freq_bound[mask] - dt_bbox = right_time_bound - left_time_bound - df_bbox = upper_freq_bound - lower_freq_bound - - # embed() - # quit() - # left_time_bound -= dt_bbox + 0.01 * (t1 - t0) - # right_time_bound += dt_bbox + 0.01 * (t1 - t0) - # lower_freq_bound -= df_bbox + 0.01 * (f1 - f0) - # upper_freq_bound += df_bbox + 0.01 * (f1 - f0) + # dt_bbox = right_time_bound - left_time_bound + # df_bbox = upper_freq_bound - lower_freq_bound left_time_bound -= 0.01 * (t1 - t0) right_time_bound += 0.05 * (t1 - t0) lower_freq_bound -= 0.01 * (f1 - f0) upper_freq_bound += 0.05 * (f1 - f0) - # embed() - # quit() mask2 = ((left_time_bound >= t0) & (right_time_bound <= t1) & (lower_freq_bound >= f0) & @@ -150,15 +141,11 @@ def bboxes_from_file(times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq lower_freq_bound, upper_freq_bound, x0, y0, x1, y1]) - # test_s = ['a', 'a', 'a', 'a'] tmp_df = pd.DataFrame( - # index= [pic_save_str for i in range(len(left_time_bound))], - # index= test_s, data=bbox.T, columns=cols ) bbox_df = pd.concat([bbox_df, tmp_df], ignore_index=True) - # bbox_df.append(tmp_df) return bbox_df def main(args): @@ -200,6 +187,7 @@ def main(args): # ) plt.show() + # Hyperparameter min_freq = 200 max_freq = 1500 d_freq = 200 @@ -207,28 +195,34 @@ def main(args): d_time = 60*10 time_overlap = 60*1 + # init dataframe if not existent so far + eval_files = [] if not os.path.exists(os.path.join('train', 'bbox_dataset.csv')): cols = ['image', 't0', 't1', 'f0', 'f1', 'x0', 'y0', 'x1', 'y1'] bbox_df = pd.DataFrame(columns=cols) + # else load datafile ... and check for already regarded files (eval_files) else: bbox_df = pd.read_csv(os.path.join('train', 'bbox_dataset.csv'), sep=',', index_col=0) cols = list(bbox_df.keys()) - eval_files = [] # ToDo: make sure not same file twice for f in pd.unique(bbox_df['image']): eval_files.append(f.split('__')[0]) + # find folders that have fine_specs... folders = list(f.parent for f in Path(args.folder).rglob('fill_times.npy')) - # embed() - # quit() for enu, folder in enumerate(folders): print(f'DataSet generation from {folder} | {enu+1}/{len(folders)}') + # check for those folders where rises are detected if not (folder/'analysis'/'rise_idx.npy').exists(): continue + # embed() + # quit() + # ToDo: check if folder in eval_files ... is so: continue + freq, times, spec, EODf_v, ident_v, idx_v, times_v, fish_freq, rise_idx, rise_size, fish_baseline_freq, fish_baseline_freq_time = ( load_data(folder)) f_res, t_res = freq[1] - freq[0], times[1] - times[0] diff --git a/data/train_test_split.py b/data/train_test_split.py new file mode 100644 index 0000000..4d1e222 --- /dev/null +++ b/data/train_test_split.py @@ -0,0 +1,44 @@ +import pandas as pd +from pathlib import Path +import numpy as np +import os + +from IPython import embed + +def define_train_test_img_names(bbox, test_size = 0.2): + np.random.seed(42) + unique_imgs = np.asarray(pd.unique(bbox['image'])) + np.random.shuffle(unique_imgs) + + test_img = sorted(unique_imgs[:int(len(unique_imgs) * test_size)]) + train_img = sorted(unique_imgs[int(len(unique_imgs) * test_size):]) + + return test_img, train_img + +def split_data_df_in_test_train_df(bbox, test_img, train_img): + cols = list(bbox.keys()) + + test_bbox = pd.DataFrame(columns=cols) + train_bbox = pd.DataFrame(columns=cols) + + for img_name in test_img: + tmp_df = bbox[bbox['image'] == img_name] + test_bbox = pd.concat([test_bbox, tmp_df], ignore_index=True) + + for img_name in train_img: + tmp_df = bbox[bbox['image'] == img_name] + train_bbox = pd.concat([train_bbox, tmp_df], ignore_index=True) + + return train_bbox, test_bbox, cols +def main(path): + bbox = pd.read_csv(path/'bbox_dataset.csv', sep=',', index_col=0) + + test_img, train_img = define_train_test_img_names(bbox) + + train_bbox, test_bbox, cols = split_data_df_in_test_train_df(bbox, test_img, train_img) + + train_bbox.to_csv(path/'bbox_train.csv', columns=cols, sep=',') + test_bbox.to_csv(path/'bbox_test.csv', columns=cols, sep=',') + +if __name__ == '__main__': + main(Path('./train')) \ No newline at end of file