@@ -1350,7 +1350,7 @@ def fifths2acc(fifths: Tuple[int]) -> Tuple[str]: ...
 def fifths2acc(
-    fifths: Union[int, pd.Series, NDArray[int], List[int], Tuple[int]]
+    fifths: Union[int, pd.Series, NDArray[int], List[int], Tuple[int]],
 ) -> Union[str, pd.Series, NDArray[str], List[str], Tuple[str]]:
     """Returns accidentals for a stack of fifths that can be combined with a
     basic representation of the seven steps."""
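The widened annotation advertises elementwise behavior over scalars and the common containers. A hedged illustration of the intended call shapes, inferred from the signature only (the outputs assume the usual line-of-fifths convention where C=0, C#=+7, Cb=-7; not verified against the implementation):

    fifths2acc(0)                      # int in, str out: "" (no accidental)
    fifths2acc(7)                      # "#"
    fifths2acc(-7)                     # "b"
    fifths2acc(pd.Series([-7, 0, 7]))  # pd.Series in, pd.Series of str out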
@@ -2453,6 +2453,7 @@ def parse_interval_index_column(df, column=None, closed="left"):
     "next": str2inttuple,
     "nominal_duration": safe_frac,
     "quarterbeats": safe_frac,
+    "quarterbeats_playthrough": safe_frac,
     "quarterbeats_all_endings": safe_frac,
     "onset": safe_frac,
     "duration": safe_frac,
@@ -2545,8 +2546,10 @@ def parse_interval_index_column(df, column=None, closed="left"):
     "phraseend": "Phrase Annotation",
     "piece": "Piece identifier",
     # 'playthrough':
-    "quarterbeats": "Offset from Beginning",
-    "quarterbeats_all_endings": "Offset from Beginning (Including Endings)",
+    "quarterbeats": "Offset from Beginning (leaving out alternative endings)",
+    "quarterbeats_playthrough": "Offset from the beginning, including all repeats (in unfolded tables)",
+    "quarterbeats_all_endings": "Offset from Beginning (counting through alternative "
+    "endings as if they were adjacent bars)",
     "regex_match": "Regular Expression Match",
     "relativeroot": "Relative Root",
     "repeats": "Repeats",
@@ -4789,6 +4792,31 @@ def enforce_piece_index_for_metadata(
     return metadata_df.set_index("piece", append=append)
 
 
+def overwrite_overview_section_in_markdown_file(file_path, md_str, logger=None):
+    if logger is None:
+        logger = module_logger
+    elif isinstance(logger, str):
+        logger = get_logger(logger)
+    if os.path.isfile(file_path):
+        msg = "Updated"
+        with open(file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+    else:
+        msg = "Created"
+        lines = []
+    # If the file exists, everything from the line containing '# Overview' onwards is
+    # overwritten; otherwise the new section is appended after a blank line.
+    with open(file_path, "w", encoding="utf-8") as f:
+        for line in lines:
+            if "# Overview" in line:
+                break
+            f.write(line)
+        else:
+            f.write("\n\n")
+        f.write(md_str)
+    logger.info(f"{msg} {file_path}")
+
+
 def write_markdown(metadata_df: pd.DataFrame, file_path: str, logger=None) -> None:
     """
     Write a subset of the DataFrame ``metadata_df`` to ``path`` in markdown format. If the file exists, it will be
@@ -4823,24 +4851,7 @@ def write_markdown(metadata_df: pd.DataFrame, file_path: str, logger=None) -> None:
     )  # comes with a first-level heading which we turn into second-level
     md_table += "\n\n*Overview table automatically updated using [ms3](https://ms3.readthedocs.io/).*\n"
 
-    if os.path.isfile(file_path):
-        msg = "Updated"
-        with open(file_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-    else:
-        msg = "Created"
-        lines = []
-    # in case the README.md exists, everything from the line including '# Overview' (or last line otherwise) is
-    # overwritten
-    with open(file_path, "w", encoding="utf-8") as f:
-        for line in lines:
-            if "# Overview" in line:
-                break
-            f.write(line)
-        else:
-            f.write("\n\n")
-        f.write(md_table)
-    logger.info(f"{msg} {file_path}")
+    overwrite_overview_section_in_markdown_file(file_path, md_table, logger)
 
 
 def prepare_metadata_for_writing(metadata_df):
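The extraction makes the README-updating logic reusable outside write_markdown. A hedged usage sketch of the new helper (path and table hypothetical): everything from the '# Overview' line onwards is replaced, or the section is appended if no such line exists:

    overwrite_overview_section_in_markdown_file(
        "my_corpus/README.md",
        "# Overview\n\n| file_name | measures |\n|---|---|\n| op01n01 | 120 |\n",
    )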
@@ -6895,3 +6906,104 @@ def write_soup_to_mscx_file(
 
 
 # endregion Functions for writing BeautifulSoup to MSCX file
+# region concatenating sub-corpus metadata
+
+
+def concat_metadata_tsv_files(path: str) -> pd.DataFrame:
+    """Walk through the first level of subdirectories and concatenate their metadata.tsv files."""
+    _, folders, _ = next(os.walk(path))
+    tsv_paths, keys = [], []
+    for subdir in sorted(folders):
+        potential = os.path.join(path, subdir, "metadata.tsv")
+        if os.path.isfile(potential):
+            tsv_paths.append(potential)
+            keys.append(subdir)
+    if len(tsv_paths) == 0:
+        return pd.DataFrame()
+    dfs = [pd.read_csv(tsv_path, sep="\t", dtype="string") for tsv_path in tsv_paths]
+    try:
+        concatenated = pd.concat(dfs, keys=keys)
+    except AssertionError:
+        info = "Levels: " + ", ".join(
+            f"{key}: {df.index.nlevels} ({df.index.names})"
+            for key, df in zip(keys, dfs)
+        )
+        print(f"Concatenation of DataFrames failed due to an alignment error. {info}")
+        raise
+    try:
+        rel_path_col = next(
+            col for col in ("subdirectory", "rel_paths") if col in concatenated.columns
+        )
+    except StopIteration:
+        raise ValueError(
+            "Metadata is expected to come with a column called 'subdirectory' or (previously) 'rel_paths'."
+        )
+    rel_paths = [
+        os.path.join(corpus, rel_path)
+        for corpus, rel_path in zip(
+            concatenated.index.get_level_values(0), concatenated[rel_path_col].values
+        )
+    ]
+    concatenated.loc[:, rel_path_col] = rel_paths
+    if "rel_path" in concatenated.columns:
+        rel_paths = [
+            os.path.join(corpus, rel_path)
+            for corpus, rel_path in zip(
+                concatenated.index.get_level_values(0), concatenated.rel_path.values
+            )
+        ]
+        concatenated.loc[:, "rel_path"] = rel_paths
+    concatenated = concatenated.droplevel(1)
+    concatenated.index.rename("corpus", inplace=True)
+    return concatenated
+
+
+def concatenated_metadata2markdown(concatenated):
+    try:
+        fname_col = next(
+            col for col in ("piece", "fname", "fnames") if col in concatenated.columns
+        )
+    except StopIteration:
+        raise ValueError(
+            "Metadata is expected to come with a column called 'piece' or (previously) 'fname' or 'fnames'."
+        )
+    rename4markdown = {
+        fname_col: "file_name",
+        "last_mn": "measures",
+        "label_count": "labels",
+        "harmony_version": "standard",
+    }
+    concatenated = concatenated.rename(columns=rename4markdown)
+    existing_columns = [
+        col for col in rename4markdown.values() if col in concatenated.columns
+    ]
+    result = "# Overview"
+    for corpus_name, df in concatenated[existing_columns].groupby(level=0):
+        heading = f"\n\n## {corpus_name}\n\n"
+        md = str(dataframe2markdown(df.fillna("")))
+        result += heading + md
+    return result
+
+
+def concat_metadata(
+    meta_corpus_dir: str, out: str, tsv_name="concatenated_metadata.tsv", logger=None
+):
+    """Concatenate the metadata.tsv files of a meta-corpus' sub-corpora, adapt the file paths, and update the README."""
+    if logger is None:
+        logger = module_logger
+    elif isinstance(logger, str):
+        logger = get_logger(logger)
+    concatenated = concat_metadata_tsv_files(meta_corpus_dir)
+    if len(concatenated) == 0:
+        print(f"No metadata found in the child directories of {meta_corpus_dir}.")
+        return
+    tsv_path = os.path.join(out, tsv_name)
+    write_tsv(concatenated, tsv_path)
+    md_str = concatenated_metadata2markdown(concatenated)
+    md_path = os.path.join(out, "README.md")
+    overwrite_overview_section_in_markdown_file(
+        file_path=md_path, md_str=md_str, logger=logger
+    )
+
+
+# endregion concatenating sub-corpus metadata
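Taken together, the new region reduces meta-corpus maintenance to a single call. A hedged sketch of the intended entry point (directory layout hypothetical):

    # meta_corpus/
    #   corpus_a/metadata.tsv
    #   corpus_b/metadata.tsv
    concat_metadata(meta_corpus_dir="meta_corpus", out="meta_corpus")
    # writes meta_corpus/concatenated_metadata.tsv and overwrites the
    # '# Overview' section of meta_corpus/README.md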