Commit ae1e10c

Concat script (#124)

Authored by Johannes Hentschel
2 parents efb3558 + 094a1b0, commit ae1e10c

File tree

3 files changed: +188 -25 lines

.pre-commit-config.yaml

+4 -4
@@ -2,7 +2,7 @@ exclude: '^docs/conf.py'

 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files

@@ -40,13 +40,13 @@ repos:
   hooks:
   - id: seed-isort-config
 - repo: https://github.com/pycqa/isort
-  rev: 5.13.2
+  rev: 6.0.0
   hooks:
   - id: isort
     args: ["--profile", "black", "--filter-files"]

 - repo: https://github.com/ambv/black
-  rev: 24.8.0
+  rev: 25.1.0
   hooks:
   - id: black
     language_version: python3.10

@@ -59,7 +59,7 @@ repos:
 # additional_dependencies: [black]

 - repo: https://github.com/PyCQA/flake8
-  rev: 7.1.1
+  rev: 7.1.2
   hooks:
   - id: flake8
     args:
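
All three hunks are routine bumps of pinned hook versions (pre-commit-hooks v4.6.0 to v5.0.0, isort 5.13.2 to 6.0.0, black 24.8.0 to 25.1.0, flake8 7.1.1 to 7.1.2), the kind of update that `pre-commit autoupdate` produces; the hook configuration itself is unchanged.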

src/ms3/utils/concat_metadata.py

+51
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# coding: utf-8
+import argparse
+import os
+
+from ms3.cli import check_and_create, check_dir
+from ms3.utils import concat_metadata
+
+
+def run(args):
+    """Unpack the arguments and run the main function."""
+    concat_metadata(
+        meta_corpus_dir=args.dir,
+        out=args.out,
+    )
+
+
+################################################################################
+# COMMANDLINE INTERFACE
+################################################################################
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="""\
+-------------------------------------------------------------------
+| Script for generating metadata and README for meta repositories |
+-------------------------------------------------------------------
+
+""",
+    )
+    parser.add_argument(
+        "-d",
+        "--dir",
+        metavar="DIR",
+        type=check_dir,
+        help="Pass the root of the repository clone to gather metadata.tsv files from its child directories. "
+        "Defaults to current working directory.",
+    )
+    parser.add_argument(
+        "-o",
+        "--out",
+        metavar="OUT_DIR",
+        type=check_and_create,
+        help="""Output directory for TSV and MD file. Defaults to current working directory.""",
+    )
+    args = parser.parse_args()
+    if args.dir is None:
+        args.dir = os.getcwd()
+    if args.out is None:
+        args.out = os.getcwd()
+    run(args)
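
For reference, the same operation can be run programmatically; a minimal sketch, assuming only that `concat_metadata` is importable as in the script above (the corpus path is hypothetical):

    from ms3.utils import concat_metadata

    # Gather metadata.tsv files from the child directories of a meta-corpus
    # clone and write concatenated_metadata.tsv plus an updated README.md
    # into the output directory.
    concat_metadata(
        meta_corpus_dir="/data/meta_corpus",  # hypothetical clone location
        out=".",
    )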

src/ms3/utils/functions.py

+133 -21
@@ -1350,7 +1350,7 @@ def fifths2acc(fifths: Tuple[int]) -> Tuple[str]: ...


 def fifths2acc(
-    fifths: Union[int, pd.Series, NDArray[int], List[int], Tuple[int]]
+    fifths: Union[int, pd.Series, NDArray[int], List[int], Tuple[int]],
 ) -> Union[str, pd.Series, NDArray[str], List[str], Tuple[str]]:
     """Returns accidentals for a stack of fifths that can be combined with a
     basic representation of the seven steps."""
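
The only change here is the trailing comma after the single parameter; this is a formatting-only adjustment (presumably to match the style enforced by the updated black/isort pins above) and leaves the function's signature and behavior untouched.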
@@ -2453,6 +2453,7 @@ def parse_interval_index_column(df, column=None, closed="left"):
     "next": str2inttuple,
     "nominal_duration": safe_frac,
     "quarterbeats": safe_frac,
+    "quarterbeats_playthrough": safe_frac,
     "quarterbeats_all_endings": safe_frac,
     "onset": safe_frac,
     "duration": safe_frac,
@@ -2545,8 +2546,10 @@ def parse_interval_index_column(df, column=None, closed="left"):
     "phraseend": "Phrase Annotation",
     "piece": "Piece identifier",
     # 'playthrough':
-    "quarterbeats": "Offset from Beginning",
-    "quarterbeats_all_endings": "Offset from Beginning (Including Endings)",
+    "quarterbeats": "Offset from Beginning (leaving out alternative endings)",
+    "quarterbeats_playthrough": "Offset from the beginning, including all repeats (in unfolded tables)",
+    "quarterbeats_all_endings": "Offset from Beginning (counting through alternative "
+    "endings as if they were adjacent bars)",
     "regex_match": "Regular Expression Match",
     "relativeroot": "Relative Root",
     "repeats": "Repeats",
@@ -4789,6 +4792,31 @@ def enforce_piece_index_for_metadata(
     return metadata_df.set_index("piece", append=append)


+def overwrite_overview_section_in_markdown_file(file_path, md_str, logger=None):
+    if logger is None:
+        logger = module_logger
+    elif isinstance(logger, str):
+        logger = get_logger(logger)
+    if os.path.isfile(file_path):
+        msg = "Updated"
+        with open(file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+    else:
+        msg = "Created"
+        lines = []
+    # in case the README.md exists, everything from the line including '# Overview' (or last line otherwise) is
+    # overwritten
+    with open(file_path, "w", encoding="utf-8") as f:
+        for line in lines:
+            if "# Overview" in line:
+                break
+            f.write(line)
+        else:
+            f.write("\n\n")
+        f.write(md_str)
+    logger.info(f"{msg} {file_path}")
+
+
 def write_markdown(metadata_df: pd.DataFrame, file_path: str, logger=None) -> None:
     """
     Write a subset of the DataFrame ``metadata_df`` to ``path`` in markdown format. If the file exists, it will be
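
The new helper hinges on Python's `for ... else`: the `else` branch runs only when the loop finishes without `break`, i.e. when no existing line contains '# Overview'. A standalone sketch of the idiom, with hypothetical file contents:

    # Everything before the first '# Overview' line is preserved;
    # from that line on, the old section is replaced by the new one.
    lines = ["# My Corpus\n", "Intro text.\n", "# Overview\n", "old table\n"]
    kept = []
    for line in lines:
        if "# Overview" in line:
            break
        kept.append(line)
    else:
        kept.append("\n\n")  # no '# Overview' found: pad before appending
    kept.append("# Overview\n\nnew table\n")
    print("".join(kept))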
@@ -4823,24 +4851,7 @@ def write_markdown(metadata_df: pd.DataFrame, file_path: str, logger=None) -> None:
     )  # comes with a first-level heading which we turn into second-level
     md_table += "\n\n*Overview table automatically updated using [ms3](https://ms3.readthedocs.io/).*\n"

-    if os.path.isfile(file_path):
-        msg = "Updated"
-        with open(file_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-    else:
-        msg = "Created"
-        lines = []
-    # in case the README.md exists, everything from the line including '# Overview' (or last line otherwise) is
-    # overwritten
-    with open(file_path, "w", encoding="utf-8") as f:
-        for line in lines:
-            if "# Overview" in line:
-                break
-            f.write(line)
-        else:
-            f.write("\n\n")
-        f.write(md_table)
-    logger.info(f"{msg} {file_path}")
+    overwrite_overview_section_in_markdown_file(file_path, md_table, logger)


 def prepare_metadata_for_writing(metadata_df):
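
With this hunk, `write_markdown` becomes a thin wrapper around the `overwrite_overview_section_in_markdown_file` helper extracted above, so per-corpus READMEs and the new meta-corpus README share a single code path for replacing the '# Overview' section.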
@@ -6895,3 +6906,104 @@ def write_soup_to_mscx_file(


 # endregion Functions for writing BeautifulSoup to MSCX file
+# region concatenating sub-corpus metadata
+
+
+def concat_metadata_tsv_files(path: str) -> pd.DataFrame:
+    """Walk through the first level of subdirectories and concatenate their metadata.tsv files."""
+    _, folders, _ = next(os.walk(path))
+    tsv_paths, keys = [], []
+    for subdir in sorted(folders):
+        potential = os.path.join(path, subdir, "metadata.tsv")
+        if os.path.isfile(potential):
+            tsv_paths.append(potential)
+            keys.append(subdir)
+    if len(tsv_paths) == 0:
+        return pd.DataFrame()
+    dfs = [pd.read_csv(tsv_path, sep="\t", dtype="string") for tsv_path in tsv_paths]
+    try:
+        concatenated = pd.concat(dfs, keys=keys)
+    except AssertionError:
+        info = "Levels: " + ", ".join(
+            f"{key}: {df.index.nlevels} ({df.index.names})"
+            for key, df in zip(keys, dfs)
+        )
+        print(f"Concatenation of DataFrames failed due to an alignment error. {info}")
+        raise
+    try:
+        rel_path_col = next(
+            col for col in ("subdirectory", "rel_paths") if col in concatenated.columns
+        )
+    except StopIteration:
+        raise ValueError(
+            "Metadata is expected to come with a column called 'subdirectory' or (previously) 'rel_paths'."
+        )
+    rel_paths = [
+        os.path.join(corpus, rel_path)
+        for corpus, rel_path in zip(
+            concatenated.index.get_level_values(0), concatenated[rel_path_col].values
+        )
+    ]
+    concatenated.loc[:, rel_path_col] = rel_paths
+    if "rel_path" in concatenated.columns:
+        rel_paths = [
+            os.path.join(corpus, rel_path)
+            for corpus, rel_path in zip(
+                concatenated.index.get_level_values(0), concatenated.rel_path.values
+            )
+        ]
+        concatenated.loc[:, "rel_path"] = rel_paths
+    concatenated = concatenated.droplevel(1)
+    concatenated.index.rename("corpus", inplace=True)
+    return concatenated
+
+
+def concatenated_metadata2markdown(concatenated):
+    try:
+        fname_col = next(
+            col for col in ("piece", "fname", "fnames") if col in concatenated.columns
+        )
+    except StopIteration:
+        raise ValueError(
+            "Metadata is expected to come with a column called 'piece' or (previously) 'fname' or 'fnames'."
+        )
+    rename4markdown = {
+        fname_col: "file_name",
+        "last_mn": "measures",
+        "label_count": "labels",
+        "harmony_version": "standard",
+    }
+    concatenated = concatenated.rename(columns=rename4markdown)
+    existing_columns = [
+        col for col in rename4markdown.values() if col in concatenated.columns
+    ]
+    result = "# Overview"
+    for corpus_name, df in concatenated[existing_columns].groupby(level=0):
+        heading = f"\n\n## {corpus_name}\n\n"
+        md = str(dataframe2markdown(df.fillna("")))
+        result += heading + md
+    return result
+
+
+def concat_metadata(
+    meta_corpus_dir: str, out: str, tsv_name="concatenated_metadata.tsv", logger=None
+):
+    """Concatenate metadata.tsv files from the sub-corpora of a meta-corpus, adapt the file paths, update the README."""
+    if logger is None:
+        logger = module_logger
+    elif isinstance(logger, str):
+        logger = get_logger(logger)
+    concatenated = concat_metadata_tsv_files(meta_corpus_dir)
+    if len(concatenated) == 0:
+        print(f"No metadata found in the child directories of {meta_corpus_dir}.")
+        return
+    tsv_path = os.path.join(out, tsv_name)
+    write_tsv(concatenated, tsv_path)
+    md_str = concatenated_metadata2markdown(concatenated)
+    md_path = os.path.join(out, "README.md")
+    overwrite_overview_section_in_markdown_file(
+        file_path=md_path, md_str=md_str, logger=logger
+    )
+
+
+# endregion concatenating sub-corpus metadata
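
`concat_metadata_tsv_files` derives its `corpus` index from the `keys` argument of `pd.concat` and then drops each frame's own integer index. A standalone sketch of that pandas pattern, with toy frames standing in for real metadata (all values hypothetical):

    import pandas as pd

    # One DataFrame per sub-corpus, as read from its metadata.tsv.
    dfs = [
        pd.DataFrame({"piece": ["BWV846"], "subdirectory": ["MS3"]}),
        pd.DataFrame({"piece": ["op01n01"], "subdirectory": ["MS3"]}),
    ]
    keys = ["bach", "beethoven"]  # the sorted sub-corpus folder names

    concatenated = pd.concat(dfs, keys=keys)  # outer index level = folder name
    concatenated = concatenated.droplevel(1)  # drop the per-frame integer index
    concatenated.index.rename("corpus", inplace=True)
    print(concatenated)  # one row per piece, indexed by 'corpus'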
