Skip to content

Commit

Permalink
Merge pull request #2376 from desihub/modernize_proctables
Browse files Browse the repository at this point in the history
Add script to update processing table column layout
  • Loading branch information
sbailey authored Sep 30, 2024
2 parents 67201d7 + 43cd53f commit 4eae355
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 44 deletions.
40 changes: 0 additions & 40 deletions bin/desi_reformat_exposure_tables

This file was deleted.

12 changes: 12 additions & 0 deletions bin/desi_reformat_exptables
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# coding: utf-8

## Thin command-line wrapper: all of the real logic lives in
## desispec.scripts.reformat_exptables.
from desispec.scripts.reformat_exptables import get_parser, reformat_exposure_tables


if __name__ == '__main__':
    options = get_parser().parse_args()

    reformat_exposure_tables(**vars(options))
14 changes: 14 additions & 0 deletions bin/desi_reformat_proctables
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# coding: utf-8

## Thin command-line wrapper: all of the real logic lives in
## desispec.scripts.reformat_proctables.
from desispec.scripts.reformat_proctables import get_parser, reformat_processing_tables


if __name__ == '__main__':
    options = get_parser().parse_args()

    reformat_processing_tables(**vars(options))
3 changes: 3 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,9 @@ desispec API
.. automodule:: desispec.scripts.reformat_exptables
:members:

.. automodule:: desispec.scripts.reformat_proctables
:members:

.. automodule:: desispec.scripts.rejectcosmics
:members:

Expand Down
30 changes: 27 additions & 3 deletions py/desispec/scripts/reformat_exptables.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""
desispec.scripts.updateexptables
================================
desispec.scripts.reformat_exptables
===================================
"""
import argparse
import os
import sys
import numpy as np
Expand All @@ -20,8 +21,31 @@
from desispec.scripts.exposuretable import create_exposure_tables


def get_parser():
    """
    Create the command-line argument parser for the
    desi_reformat_exposure_tables script.

    Returns:
        argparse.ArgumentParser: parser for the options documented in each
            add_argument() help string below.
    """
    # argparse usage strings use %-style interpolation, so "{prog}" would be
    # printed literally; "%(prog)s" substitutes the program name.
    parser = argparse.ArgumentParser(usage="%(prog)s [options]")
    parser.add_argument("-n", "--nights", type=str, default=None,
                        help="nights as comma separated string")
    # Note: the concatenated help fragments each end with a space so the
    # rendered help text doesn't run words together.
    parser.add_argument("--night-range", type=str, default=None,
                        help="comma separated pair of nights in form YYYYMMDD,YYYYMMDD "
                             "for first_night,last_night specifying the beginning "
                             "and end of a range of nights to be generated. "
                             "last_night should be inclusive.")
    parser.add_argument("--obstypes", type=str, default=None,
                        help="comma separated list of exposure types to include in "
                             "the exposure table, e.g. science,arc,flat,dark,zero, ...")
    parser.add_argument("-i", "--path-to-data", type=str, default=None,
                        help="path to the raw input data")
    parser.add_argument("-o", "--exp-table-path", type=str, default=None,
                        help="path to save exposure tables, without monthly subdirectory")
    parser.add_argument("--orig-filetype", type=str, default='csv',
                        help="format type for original exposure tables")
    parser.add_argument("--out-filetype", type=str, default='csv',
                        help="format type for output exposure tables")
    parser.add_argument("--verbose", action="store_true",
                        help="print verbose output")
    parser.add_argument("--dry-run", action="store_true",
                        help="Perform a dry run, printing the changes that would be made "
                             "and the final output table "
                             "but not overwriting the actual files on disk.")
    parser.add_argument("--no-specprod", action="store_true",
                        help="Create exposure table in repository location "
                             "rather than the SPECPROD location.")
    return parser

def update_exposure_tables(nights=None, night_range=None, path_to_data=None,
def reformat_exposure_tables(nights=None, night_range=None, path_to_data=None,
exp_table_path=None, obstypes=None, orig_filetype='csv',
out_filetype='csv', verbose=False, no_specprod=False,
dry_run=False):
Expand Down
194 changes: 194 additions & 0 deletions py/desispec/scripts/reformat_proctables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
"""
desispec.scripts.reformat_proctables
====================================
"""
import argparse
import os
import glob
import sys
import numpy as np
import re
import time
from astropy.table import Table

from desispec.io.meta import findfile
from desispec.workflow.proctable import get_processing_table_column_defs
from desispec.workflow.utils import define_variable_from_environment, listpath, \
pathjoin
from desispec.workflow.tableio import write_table, load_table
from desispec.scripts.exposuretable import create_exposure_tables


def get_parser():
    """
    Create the command-line argument parser for the
    desi_reformat_processing_tables script.

    Returns:
        argparse.ArgumentParser: parser for the options documented in each
            add_argument() help string below.
    """
    # argparse usage strings use %-style interpolation, so "{prog}" would be
    # printed literally; "%(prog)s" substitutes the program name.
    parser = argparse.ArgumentParser(usage="%(prog)s [options]")
    parser.add_argument("-n", "--nights", type=str, default=None,
                        help="nights as comma separated string")
    # Note: the concatenated help fragments each end with a space so the
    # rendered help text doesn't run words together.
    parser.add_argument("--night-range", type=str, default=None,
                        help="comma separated pair of nights in form YYYYMMDD,YYYYMMDD "
                             "for first_night,last_night specifying the beginning "
                             "and end of a range of nights to be generated. "
                             "last_night should be inclusive.")
    # Help text fixed: this script operates on processing tables, not
    # exposure tables (copy-paste from the exptables parser).
    parser.add_argument("--orig-filetype", type=str, default='csv',
                        help="format type for original processing tables")
    parser.add_argument("--out-filetype", type=str, default='csv',
                        help="format type for output processing tables")
    parser.add_argument("--dry-run", action="store_true",
                        help="Perform a dry run, printing the changes that would be made "
                             "and the final output table "
                             "but not overwriting the actual files on disk.")
    return parser

def reformat_processing_tables(nights=None, night_range=None, orig_filetype='csv',
                               out_filetype='csv', dry_run=False):
    """
    Regenerate processing tables for the requested nights using the current
    column layout. Requires a processing table to already exist on disk for
    each night: this script updates tables, it does not create them.

    Args:
        nights: str, int, or comma separated list. The night(s) to update
            processing tables for. 'all' (or None with night_range set)
            means every night that has a processing table on disk.
        night_range: str. Comma separated pair of nights in form
            YYYYMMDD,YYYYMMDD for first_night,last_night specifying the
            beginning and end of a range of nights. Both endpoints are
            inclusive.
        orig_filetype: str. The file extension (without the '.') of the
            input processing tables.
        out_filetype: str. The file extension (without the '.') of the
            output processing tables.
        dry_run: bool. If True, print the table that would be written but
            do not move or overwrite any files on disk.

    Returns:
        Nothing

    Raises:
        ValueError: if neither nights nor night_range is given, or if
            night_range is malformed.
    """
    # log = get_logger()
    ## Make sure user specified what nights to run on
    if nights is None and night_range is None:
        raise ValueError("Must specify either nights or night_range."
                         + " To process all nights give nights=all")

    ## Get all nights in 2020's with data
    proctab_template = findfile('proctable', night=99999999)
    proctab_template = proctab_template.replace('99999999', '202[0-9][01][0-9][0-3][0-9]')
    proctab_template = proctab_template.replace('.csv', f'.{orig_filetype}')
    nights_with_proctables = list()
    for ptabfn in glob.glob(proctab_template):
        ## nights are 202YMMDD; raw string so \d is a regex class, not an escape
        matches = re.findall(r'202\d{5}', os.path.basename(ptabfn))
        if len(matches) == 1:
            nights_with_proctables.append(int(matches[0]))
        else:
            print(f"Couldn't parse a night from proctable file: {ptabfn}")

    ## If unspecified or given "all", set nights to all nights with data
    check_night = False
    if nights is None or nights == 'all':
        nights = nights_with_proctables
        ## No need to check nights since derived from disk
    else:
        nights = [int(val.strip()) for val in nights.split(",")]
        ## If nights are specified, make sure we check that there is actually data
        check_night = True
    nights = np.sort(nights)

    ## If user specified a night range, cut nights to that range of dates
    if night_range is not None:
        if ',' not in night_range:
            raise ValueError("night_range must be a comma separated pair of "
                             + "nights in form YYYYMMDD,YYYYMMDD")
        nightpair = night_range.split(',')
        if len(nightpair) != 2 or not nightpair[0].isnumeric() \
                or not nightpair[1].isnumeric():
            raise ValueError("night_range must be a comma separated pair of "
                             + "nights in form YYYYMMDD,YYYYMMDD")
        first_night, last_night = nightpair
        nights = nights[np.where(int(first_night) <= nights.astype(int))[0]]
        nights = nights[np.where(int(last_night) >= nights.astype(int))[0]]

    ## Get current set of expected columns
    ptab_cols, ptab_dtypes, ptab_defs = get_processing_table_column_defs(return_default_values=True)
    ptab_cols, ptab_dtypes = np.array(ptab_cols), np.array(ptab_dtypes)

    ## Tell user the final list of nights and starting looping over them
    print("Nights: ", nights)
    for night in nights:
        if check_night and night not in nights_with_proctables:
            print(f"Night {night} doesn't have a processing table: Skipping.")
            continue

        ## If the processing table doesn't exist, skip, since we are updating
        ## not generating.
        orig_pathname = findfile('proctable', night=night).replace('.csv', f'.{orig_filetype}')
        if not os.path.exists(orig_pathname):
            print(f'Could not find processing table for night={night} at:'
                  + f' {orig_pathname}. Skipping this night.')
            continue

        ## Load the old table and compare its columns to the expected layout
        origtable = load_table(orig_pathname, tabletype='proctab')
        curr_colnames = np.array(list(origtable.colnames))
        expected_cols = np.isin(curr_colnames, ptab_cols)
        found_cols = np.isin(ptab_cols, curr_colnames)

        ## If everything is present, don't try to do anything
        if np.all(expected_cols) and np.all(found_cols):
            print(f"{orig_pathname} has all of the expected columns, not updating this table.")
            continue

        unexpected = list(curr_colnames[~expected_cols])
        missing = list(ptab_cols[~found_cols])
        print(f"Found the following unexpected columns: {unexpected}")
        print(f"Found the following missing columns: {missing}")

        ## Solving the only cases I'm currently aware of
        if 'CAMWORD' in unexpected and 'PROCCAMWORD' in missing:
            print("CAMWORD listed instead of PROCCAMWORD. Updating that.")
            origtable.rename_column('CAMWORD', 'PROCCAMWORD')
            ## Bug fix: this previously removed 'CAWORD' (typo), which raised
            ## ValueError because that name was never in the list.
            unexpected.remove('CAMWORD')
            missing.remove('PROCCAMWORD')

        if len(unexpected) > 0:
            print("WARNING: Script detected unexpected columns. Only handle "
                  + "the case where 'CAMWORD' is defined instead of PROCCAMWORD. "
                  + "The following unexpected columns will be dropped without "
                  + f"using the information they contain: {unexpected}.")
            for colname in unexpected:
                origtable.remove_column(colname)

        ## Add any missing columns, filled with the current default values
        for colname in missing:
            if colname not in ['BADAMPS', 'LASTSTEP', 'EXPFLAG']:
                print(f"WARNING: Script didn't expect {colname} to be missing. "
                      + "Replacing with default values, but this may have "
                      + "downstream consequences.")
            colindex = np.where(ptab_cols == colname)[0][0]
            newdat = [ptab_defs[colindex]] * len(origtable)
            newcol = Table.Column(name=colname, data=newdat, dtype=ptab_dtypes[colindex])
            origtable.add_column(newcol)

        ## Finally, reorder to the current column ordering
        origtable = origtable[list(ptab_cols)]

        ## If just testing, print the table and a cell-by-cell equality test
        ## for the scalar columns
        ## If not testing, move the original table to an archived filename
        ## and save the updated table to the official proctable pathname
        if dry_run:
            print("\n\nOutput file would have been:")
            origtable.pprint_all()
        else:
            ftime = time.strftime("%Y%m%d_%Hh%Mm")
            replaced_pathname = orig_pathname.replace(f".{orig_filetype}",
                                                      f".replaced-{ftime}.{orig_filetype}")
            print(f"Moving original file from {orig_pathname} to {replaced_pathname}")
            os.rename(orig_pathname, replaced_pathname)
            ## brief pause so the archived file's mtime precedes the new file's
            time.sleep(0.1)
            out_pathname = orig_pathname.replace(f".{orig_filetype}", f".{out_filetype}")
            write_table(origtable, out_pathname)
            print(f"Updated file saved to {out_pathname}. Original archived as {replaced_pathname}")

        print("\n\n")

    ## Flush the outputs
    sys.stdout.flush()
    sys.stderr.flush()
    print("Processing table regenerations complete")
2 changes: 1 addition & 1 deletion py/desispec/workflow/proctable.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def get_processing_table_column_defs(return_default_values=False,
coldeflt2 = [ 'a0123456789' , 0 , -99 , '' , 'unknown', defqid ]

colnames2 += [ 'SUBMIT_DATE', 'STATUS' , 'SCRIPTNAME']
coltypes2 += [ int , 'S14' , 'S40' ]
coltypes2 += [ int , 'S14' , 'S50' ]
coldeflt2 += [ -99 , 'UNSUBMITTED', '' ]

colnames2 += ['INT_DEP_IDS' , 'LATEST_DEP_QID' , 'ALL_QIDS' ]
Expand Down

0 comments on commit 4eae355

Please sign in to comment.