greenburstAux/csv_merge.py
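
"""Merge cands/results_a.csv into each observation's main CSV file; the merged
table is written alongside the inputs with a _merged.csv suffix."""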

import logging
import os
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

import pandas as pd
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def process(directory):
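    """Merge the candidate results into the main CSV of every subfolder of `directory`."""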
    # iterate over the observation folders inside `directory`
    for subdir in [f.path for f in os.scandir(directory) if f.is_dir()]:
        dirname = os.path.basename(subdir)
        if dirname == '':
            dirname = os.path.basename(os.path.dirname(subdir))
        mainCSV = pd.read_csv(os.path.join(subdir, f"{dirname}.csv"))
        candCSV = pd.read_csv(os.path.join(subdir, "cands", "results_a.csv"))
        merge(mainCSV, candCSV, os.path.join(subdir, f"{dirname}_merged.csv"))
def merge(mainDF: pd.DataFrame, candDF: pd.DataFrame, outname):
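    """Copy each candidate's probability and label into the row of mainDF whose DM and SNR
    match to two decimal places, then write the merged table to `outname`."""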
    # add the new columns to the main dataframe, initialised to 0
    colNum = len(mainDF.columns)
    mainDF.insert(colNum, "probability", 0)
    mainDF.insert(colNum + 1, "label", 0)
    # iterate over candidates
    for props in candDF.itertuples(index=False):
        # get dm and snr via string parsing of the candidate name;
        # float() handles any leading/trailing zeros in the parsed fields
        dm = float(props.candidate.split("dm_")[1].split("_")[0].split(".h5")[0])
        snr = float(props.candidate.split("snr_")[1].split("_")[0].split(".h5")[0])
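        # the candidate name is assumed to look something like
        # "cand_..._dm_56.78_snr_9.10.h5" (hypothetical example); only the values
        # after "dm_" and "snr_" are split out above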
        # use those values to index the main dataframe and replace the new columns' values
        dmMatch = mainDF['dm'].map(lambda d: round(d, 2) == round(dm, 2))
        snrMatch = mainDF['snr'].map(lambda s: round(s, 2) == round(snr, 2))
        row = mainDF[dmMatch & snrMatch]
        if len(row) > 1:
            logger.error(f"{outname}: Multiple matches found for DM {dm} and SNR {snr}.")
        elif len(row) == 0:
            logger.error(f"{outname}: No matches found for DM {dm} and SNR {snr}.")
        else:
            index = int(row.index[0])
            mainDF.loc[index, 'probability'] = props.probability
            mainDF.loc[index, 'label'] = props.label
    mainDF.to_csv(outname)
if __name__ == "__main__":
    parser = ArgumentParser(
        description="Merge results_a.csv into each main CSV file. Merged CSV has a _merged.csv suffix.",
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('-p', '--path', type=str, help="Main folder containing all observation folders.")
    parser.add_argument('-d', '--directory', type=str, help="Single folder to process.")
    parser.set_defaults(path=None, directory=None)
    values = parser.parse_args()
    if values.path is not None:
        process(values.path)
    elif values.directory is not None:
        dirname = os.path.basename(values.directory)
        if dirname == '':
            dirname = os.path.basename(os.path.dirname(values.directory))
        mainCSV = pd.read_csv(os.path.join(values.directory, f"{dirname}.csv"))
        candCSV = pd.read_csv(os.path.join(values.directory, "cands", "results_a.csv"))
        logger.info(f"Working with {os.path.join(values.directory, f'{dirname}.csv')} and {os.path.join(values.directory, 'cands', 'results_a.csv')}")
        merge(mainCSV, candCSV, os.path.join(values.directory, f"{dirname}_merged.csv"))
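
# Example usage (hypothetical paths):
#   python csv_merge.py -p /data/observations        # merge every observation folder under the path
#   python csv_merge.py -d /data/observations/obs01  # merge a single observation folder
# Each observation folder is expected to contain <dirname>.csv and cands/results_a.csv;
# the merged table is written to <dirname>_merged.csv alongside them.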