"""Merge candidate results into per-observation CSV files.

For an observation folder ``<name>/`` this script reads ``<name>/<name>.csv``
and ``<name>/cands/results_a.csv``, copies each candidate's probability and
label onto the main-CSV row with the same DM and SNR (compared after rounding
to two decimals), and writes ``<name>/<name>_merged.csv``.
"""
# Provenance: csv_merge.py was added by commit
# 284728988655abe4458bf72e848a9d1a76e0c866 (Sakimori, 2025-09-01),
# subject "initial test of merging for susie".

import logging
import os
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def _parse_value(name, key):
    """Return the number that follows ``<key>_`` in a candidate name.

    The value runs up to the next underscore; a ``.h5`` extension is dropped.

    Raises:
        ValueError: if ``<key>_`` is absent or the text after it is not a number.
    """
    _, found, tail = name.partition(key + "_")
    if not found:
        raise ValueError(f"'{key}_' not found in candidate name {name!r}")
    token = tail.split("_")[0].split(".h5")[0]
    try:
        # float() already accepts padding zeros ("0056.80000"). Stripping "0"
        # by hand would turn "100" into "1" and "0.000" into ".", so don't.
        return float(token)
    except ValueError as err:
        raise ValueError(
            f"cannot parse {key} value {token!r} from candidate name {name!r}"
        ) from err


def _merge_directory(directory):
    """Merge one observation folder, writing ``<name>_merged.csv`` inside it."""
    # abspath drops a trailing separator and resolves ".", either of which
    # would otherwise yield an empty or meaningless folder name.
    dirname = os.path.basename(os.path.abspath(directory))
    mainCSV = pd.read_csv(os.path.join(directory, f"{dirname}.csv"))
    candCSV = pd.read_csv(os.path.join(directory, "cands", "results_a.csv"))
    merge(mainCSV, candCSV, os.path.join(directory, f"{dirname}_merged.csv"))


def process(directory):
    """Merge every immediate sub-folder of ``directory``.

    Args:
        directory: folder whose sub-folders are observation folders.
    """
    # Close the scandir handle promptly; sort so the processing order is stable.
    with os.scandir(directory) as entries:
        subdirs = sorted(entry.path for entry in entries if entry.is_dir())
    for subdir in subdirs:
        _merge_directory(subdir)


def merge(mainDF: pd.DataFrame, candDF: pd.DataFrame, outname) -> None:
    """Attach candidate probability/label values to ``mainDF`` and save it.

    ``mainDF`` is modified in place: ``probability`` and ``label`` columns are
    appended (initialised to zero) and filled in for every candidate that
    matches exactly one row on DM and SNR. Candidates with no match, several
    matches, or an unparseable name are logged as errors and skipped.

    Args:
        mainDF: main table; must have ``dm`` and ``snr`` columns.
        candDF: candidate table whose rows unpack as (name, probability, label).
        outname: path of the CSV file to write.
    """
    n_cols = len(mainDF.columns)
    # Float placeholder: fractional probabilities are written into this column,
    # and storing them in an integer column relies on implicit upcasting, which
    # pandas has deprecated.
    mainDF.insert(n_cols, "probability", 0.0)
    mainDF.insert(n_cols + 1, "label", 0)

    # The rounded main-table keys do not change while candidates are applied,
    # so compute them once instead of once per candidate.
    dm_rounded = mainDF["dm"].map(lambda d: round(d, 2))
    snr_rounded = mainDF["snr"].map(lambda s: round(s, 2))

    for name, probability, label in candDF.itertuples(index=False):
        try:
            dm = _parse_value(name, "dm")
            snr = _parse_value(name, "snr")
        except ValueError as err:
            logger.error("%s: Skipping candidate: %s", outname, err)
            continue

        matched = (dm_rounded == round(dm, 2)) & (snr_rounded == round(snr, 2))
        n_matched = int(matched.sum())
        if n_matched > 1:
            logger.error(
                "%s: Multiple matches found for DM %s and SNR %s.", outname, dm, snr
            )
        elif n_matched == 0:
            logger.error(
                "%s: No matches found for DM %s and SNR %s.", outname, dm, snr
            )
        else:
            mainDF.loc[matched, "probability"] = probability
            mainDF.loc[matched, "label"] = label

    mainDF.to_csv(outname)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = ArgumentParser(
        description="Merge results_a.csv into each main CSV file. Merged CSV has a _merged.csv suffix.",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-p', '--path', type=str, help="Main folder containing all observation folders.")
    parser.add_argument('-d', '--directory', type=str, help="Single folder to process.")
    parser.set_defaults(path=None, directory=None)
    values = parser.parse_args(argv)

    # --path takes precedence when both options are supplied.
    if values.path is not None:
        process(values.path)
    elif values.directory is not None:
        _merge_directory(values.directory)
    else:
        parser.error("one of -p/--path or -d/--directory is required")


if __name__ == "__main__":
    main()