initial test of merging for susie

2025-09-01 13:31:50 -04:00 · 2025-09-01 13:31:50 -04:00 · 2847289886
parent 597e901f56
commit 2847289886
1 changed files with 62 additions and 0 deletions
--- a/csv_merge.py
+++ b/csv_merge.py
@ -0,0 +1,62 @@
 import logging, os
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 import numpy as np
 import pandas as pd
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 def process(directory):
    """Merge candidate results into the main CSV of every sub-folder.

    For each immediate sub-folder ``<name>`` of ``directory`` this reads
    ``<name>/<name>.csv`` and ``<name>/cands/results_a.csv`` and hands both
    tables to ``merge``, which writes ``<name>/<name>_merged.csv``.

    Args:
        directory: Folder whose sub-folders are processed one by one.
    """
    # Collect the sub-folder paths first, before any merged file is written.
    subfolders = []
    for entry in os.scandir(directory):
        if entry.is_dir():
            subfolders.append(entry.path)

    for folder in subfolders:
        obs_name = os.path.basename(folder)
        main_path = os.path.join(folder, f"{obs_name}.csv")
        cand_path = os.path.join(folder, "cands", "results_a.csv")
        merged_path = os.path.join(folder, f"{obs_name}_merged.csv")

        main_table = pd.read_csv(main_path)
        cand_table = pd.read_csv(cand_path)
        merge(main_table, cand_table, merged_path)
 def _parse_name_value(name, key):
    """Return the number that follows ``key`` in a candidate file name.

    The value runs up to the next underscore; a trailing ``.h5`` extension is
    dropped before conversion.
    """
    token = name.split(key)[1].split("_")[0].split(".h5")[0]
    # float() already accepts zero-padded text such as "056.780". Stripping "0"
    # characters by hand would turn "100" into "1" and "0.0" into ".".
    return float(token)


 def merge(mainDF:pd.DataFrame, candDF:pd.DataFrame, outname):
    """Copy candidate probabilities and labels into the main table and save it.

    Two columns, ``probability`` and ``label``, are appended to ``mainDF``
    (the frame is modified in place). Each candidate is matched to the single
    row of ``mainDF`` whose ``dm`` and ``snr`` agree with the values parsed
    from the candidate name after rounding to two decimals. Candidates with
    no match, or with more than one match, are logged as errors and skipped.
    The resulting table is written to ``outname`` with ``DataFrame.to_csv``.

    Args:
        mainDF: Table with at least ``dm`` and ``snr`` columns.
        candDF: Table whose rows unpack as ``(name, probability, label)``;
            ``name`` must contain ``dm_<value>`` and ``snr_<value>``.
        outname: Path of the CSV file to write.
    """
    #add new columns to main dataframe
    colNum = len(mainDF.columns)
    # Float default: probabilities are fractional, and writing them into an
    # integer column relies on upcasting that recent pandas deprecates.
    mainDF.insert(colNum, "probability", 0.0)
    mainDF.insert(colNum+1, "label", 0)

    # The rounded lookup columns do not depend on the candidate, so compute
    # them once instead of once per candidate.
    dmRounded = mainDF['dm'].map(lambda d: round(d, 2))
    snrRounded = mainDF['snr'].map(lambda s: round(s, 2))

    #iterate over candidates
    for name, probability, label in candDF.itertuples(index=False):
        #get dm and snr via string parsing the cand name
        dm = _parse_name_value(name, "dm_")
        snr = _parse_name_value(name, "snr_")
        #use those values to index the main dataframe and replace values
        matches = mainDF.index[(dmRounded == round(dm, 2)) & (snrRounded == round(snr, 2))]
        if len(matches) > 1:
            logger.error(f"{outname}: Multiple matches found for DM {dm} and SNR {snr}.")
        elif len(matches) == 0:
            logger.error(f"{outname}: No matches found for DM {dm} and SNR {snr}.")
        else:
            index = matches[0]
            mainDF.loc[index, 'probability'] = probability
            mainDF.loc[index, 'label'] = label
    mainDF.to_csv(outname)
 if __name__ == "__main__":
    # Command-line entry point: merge every observation folder under --path,
    # or just the single folder given with --directory.
    parser = ArgumentParser(
        description="Merge results_a.csv into each main CSV file. Merged CSV has a _merged.csv suffix.",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-p','--path', type=str, help="Main folder containing all observation folders.")
    parser.add_argument('-d','--directory', type=str, help="Single folder to process.")
    parser.set_defaults(path=None, directory=None)
    values = parser.parse_args()
    if values.path is not None:
        # Use the parsed argument; no variable named `path` exists here.
        process(values.path)
    elif values.directory is not None:
        # normpath drops a trailing separator, which would otherwise make
        # basename return "" and the script look for a file named ".csv".
        dirname = os.path.basename(os.path.normpath(values.directory))
        mainCSV = pd.read_csv(os.path.join(values.directory, f"{dirname}.csv"))
        candCSV = pd.read_csv(os.path.join(values.directory, "cands","results_a.csv"))
        merge(mainCSV, candCSV, os.path.join(values.directory, f"{dirname}_merged.csv"))
    else:
        # Neither option given: show usage instead of exiting silently.
        parser.print_help()