Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] : Implementation of read_archive function #1438

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
2 changes: 1 addition & 1 deletion README.md
222 changes: 222 additions & 0 deletions janitor/functions/read_archive.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the contents of this file should be moved under io, and they do not need the @pf.register_dataframe_method decorator either.

Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
from __future__ import annotations

import tarfile
import zipfile

import pandas as pd
import pandas_flavor as pf

from janitor.utils import check


# NOTE(review): registering this as a DataFrame method would presumably pass
# the DataFrame itself as `file_path`; a reviewer has asked for the decorator
# to be dropped. It is kept here so the module-level behavior is unchanged.
@pf.register_dataframe_method
def read_archive(
    file_path: str,
    extract_to_df: bool = True,
    file_type: str | None = None,
) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
    """
    Read an archive file (.zip, .tar, .tar.gz) and either list its compatible
    members or load user-selected members into DataFrames.

    When `extract_to_df` is True the user is prompted on standard input to
    choose which compatible files to load.

    Examples:
        >>> from janitor.functions.read_archive import read_archive
        >>> files = read_archive(
        ...     "data.zip", extract_to_df=False
        ... )  # doctest: +SKIP

    Args:
        file_path: The path to the archive file.
        extract_to_df: Whether to read the selected contents into
            DataFrames (CSV and XLSX members). Default is True.
        file_type: Optional file type hint ('zip', 'tar', 'tar.gz').
            If None (or empty), it is inferred from the file extension.

    Returns:
        - A pandas DataFrame if `extract_to_df` is True and a single file
          is selected, or a list of DataFrames if several are selected.
        - A list of compatible file names in the archive otherwise.

    Raises:
        ValueError: If the archive type is unsupported or cannot be
            inferred from the file extension.
    """
    check("file_path", file_path, [str])
    check("extract_to_df", extract_to_df, [bool])

    # An explicit hint wins; otherwise fall back to the file extension.
    file_type = file_type or _infer_file_type(file_path)

    if file_type == "zip":
        return _process_zip_archive(file_path, extract_to_df)
    if file_type in {"tar", "tar.gz"}:
        return _process_tar_archive(file_path, extract_to_df)
    raise ValueError(
        "Unsupported archive format. "
        "Supported formats are .zip, .tar, or .tar.gz."
    )


def _infer_file_type(file_path: str) -> str:
    """
    Infer the type of the archive based on the file extension.

    The comparison is case-insensitive, so names such as ``DATA.ZIP`` or
    ``backup.TAR.GZ`` are recognised as well.

    Args:
        file_path: Path to the file.

    Returns:
        A string representing the archive type ('zip', 'tar', 'tar.gz').

    Raises:
        ValueError: If the file extension is unsupported.
    """
    # Work on a lower-cased copy; the caller's path is left untouched.
    lowered = file_path.lower()
    if lowered.endswith(".zip"):
        return "zip"
    if lowered.endswith(".tar.gz"):
        return "tar.gz"
    if lowered.endswith(".tar"):
        return "tar"
    raise ValueError(
        "Cannot infer file type from the file extension. "
        "Please specify the 'file_type' parameter."
    )


def _process_zip_archive(
    file_path: str, extract_to_df: bool
) -> pd.DataFrame | list[str]:
    """
    Open a ZIP archive and either list or load its compatible members.

    Args:
        file_path: Path to the ZIP file.
        extract_to_df: When True, prompt for a selection and load it;
            when False, only return the compatible member names.

    Returns:
        The loaded DataFrame(s), or the list of compatible file names.
    """
    with zipfile.ZipFile(file_path) as zip_handle:
        candidates = _list_compatible_files(zip_handle.namelist())
        if not extract_to_df:
            return candidates
        # Extraction must happen while the archive is still open.
        return _select_and_extract_from_zip(zip_handle, candidates)


def _process_tar_archive(
    file_path: str, extract_to_df: bool
) -> pd.DataFrame | list[str]:
    """
    Open a TAR archive and either list or load its compatible members.

    Args:
        file_path: Path to the TAR file.
        extract_to_df: When True, prompt for a selection and load it;
            when False, only return the compatible member names.

    Returns:
        The loaded DataFrame(s), or the list of compatible file names.
    """
    # The open mode is chosen from the file name, not from `file_type`.
    open_mode = "r"
    if file_path.endswith(".gz"):
        open_mode = "r:gz"

    with tarfile.open(file_path, open_mode) as tar_handle:
        candidates = _list_compatible_files(tar_handle.getnames())
        if not extract_to_df:
            return candidates
        # Extraction must happen while the archive is still open.
        return _select_and_extract_from_tar(tar_handle, candidates)


def _list_compatible_files(file_names: list[str]) -> list[str]:
    """
    Helper function to list compatible files (.csv, .xlsx) from an archive.

    Args:
        file_names: List of file names in the archive.

    Returns:
        List of compatible file names, in their original order.

    Raises:
        ValueError: If no file name ends with a compatible extension.
    """
    compatible_files = [
        file_name
        for file_name in file_names
        if file_name.endswith((".csv", ".xlsx"))
    ]
    # The former debug print was removed: a library helper should not write
    # to stdout on every call.
    if not compatible_files:
        raise ValueError("No compatible files found in the archive.")
    return compatible_files


def _select_and_extract_from_zip(
    archive: zipfile.ZipFile, compatible_files: list[str]
) -> pd.DataFrame | list[pd.DataFrame]:
    """
    Prompt the user for a selection and load the chosen ZIP members.

    Args:
        archive: The open ZIP archive object.
        compatible_files: List of compatible file names.

    Returns:
        A single DataFrame when one frame was loaded,
        otherwise a list of DataFrames.
    """
    frames = []
    for member_name in _select_files_interactively(compatible_files):
        with archive.open(member_name) as handle:
            # The reader is chosen from the member's extension.
            if member_name.endswith(".csv"):
                frames.append(pd.read_csv(handle))
            elif member_name.endswith(".xlsx"):
                frames.append(pd.read_excel(handle))
    if len(frames) > 1:
        return frames
    return frames[0]


def _select_and_extract_from_tar(
    archive: tarfile.TarFile, compatible_files: list[str]
) -> pd.DataFrame | list[pd.DataFrame]:
    """
    Prompt the user for a selection and load the chosen TAR members.

    Args:
        archive: The open TAR archive object.
        compatible_files: List of compatible file names.

    Returns:
        A single DataFrame when one frame was loaded,
        otherwise a list of DataFrames.
    """
    frames = []
    for member_name in _select_files_interactively(compatible_files):
        tar_member = archive.getmember(member_name)
        with archive.extractfile(tar_member) as handle:
            # The reader is chosen from the member's extension.
            if member_name.endswith(".csv"):
                frames.append(pd.read_csv(handle))
            elif member_name.endswith(".xlsx"):
                frames.append(pd.read_excel(handle))
    if len(frames) > 1:
        return frames
    return frames[0]


def _select_files_interactively(compatible_files: list[str]) -> list[str]:
    """
    Allow the user to select files from a list interactively.

    The candidates are printed as a 1-based numbered menu and the answer is
    read from standard input as a comma-separated list of numbers. Entries
    that are not plain decimal numbers, or that fall outside the menu range,
    are ignored.

    Args:
        compatible_files: List of compatible file names.

    Returns:
        List of selected file names, in the order they were entered.

    Raises:
        ValueError: If the answer contains no valid selection.
    """
    print("Compatible files found in the archive:")
    for idx, file_name in enumerate(compatible_files, 1):
        print(f"{idx}. {file_name}")

    raw_answer = input(
        "Enter the numbers of the files to read, "
        "separated by commas (e.g., 1,2,3): "
    )

    selected_files = []
    for token in raw_answer.split(","):
        token = token.strip()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which made int() raise.
        if not token.isdecimal():
            continue
        position = int(token)
        if 0 < position <= len(compatible_files):
            selected_files.append(compatible_files[position - 1])

    if not selected_files:
        raise ValueError("No valid files selected.")
    return selected_files
2 changes: 1 addition & 1 deletion janitor/spark/backend.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file shouldn't be changed, IMO, based on what I see in this PR.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import wraps

try:
from pyspark.pandas.extensions import register_dataframe_accessor
from pandas.api.extensions import register_dataframe_accessor

except ImportError:
from janitor.utils import import_message
Expand Down
2 changes: 1 addition & 1 deletion mkdocs/AUTHORS.md
2 changes: 1 addition & 1 deletion mkdocs/CHANGELOG.md
3 changes: 3 additions & 0 deletions test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
col1,col2
1,2
3,4
5 changes: 3 additions & 2 deletions tests/functions/test_complete.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sabrina-Hassaim it feels like the changes in here weren't necessary for testing read_archive functionality, is that right? Could you revert these please?

Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,8 @@ def test_complete_multiple_groupings():
fill_value={"tag_count": 0},
sort=True,
).astype({"tag_count": int})
assert_frame_equal(result, output3)
print(result)
assert_frame_equal(result, output3, check_dtype=False)


def test_fill_value_scalar(taxonomy_df):
Expand All @@ -451,7 +452,7 @@ def test_fill_value_scalar(taxonomy_df):
.sort_values("Taxon", ignore_index=True)
)

assert_frame_equal(result, expected)
assert_frame_equal(result, expected, check_dtype=False)


# http://imachordata.com/2016/02/05/you-complete-me/
Expand Down
89 changes: 89 additions & 0 deletions tests/functions/test_read_archive.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the contents of read_archive.py moving into io.py, I think these tests can be moved to the appropriate test file as well.

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import io
import os
import tarfile
import zipfile

import pandas as pd
import pytest

from janitor.functions.read_archive import read_archive


# Helper function to create ZIP archives
def create_test_zip(archive_path, files):
    """Write a ZIP archive at `archive_path` with one member per `files` item.

    Args:
        archive_path: Destination path of the archive.
        files: Mapping of member name to member content.
    """
    zip_handle = zipfile.ZipFile(archive_path, "w")
    with zip_handle:
        for member_name in files:
            zip_handle.writestr(member_name, files[member_name])


# Helper function to create TAR archives
def create_test_tar(archive_path, files):
    """Write a gzip-compressed TAR archive with one member per `files` item.

    Args:
        archive_path: Destination path of the archive.
        files: Mapping of member name to text content (stored as UTF-8).
    """
    tar_handle = tarfile.open(archive_path, "w:gz")
    with tar_handle:
        for member_name in files:
            payload = files[member_name].encode("utf-8")
            header = tarfile.TarInfo(name=member_name)
            # The size must be set explicitly for in-memory payloads.
            header.size = len(payload)
            tar_handle.addfile(header, io.BytesIO(payload))


# Fixture for creating a test ZIP archive
@pytest.fixture
def test_zip(tmp_path):
    """Create a ZIP archive holding a single CSV and return its path as str."""
    target = tmp_path / "test.zip"
    create_test_zip(target, {"file1.csv": "col1,col2\n1,2\n3,4"})
    return str(target)


# Fixture for creating a test TAR.GZ archive
@pytest.fixture
def test_tar(tmp_path):
    """Create a TAR.GZ archive holding a single CSV and return its path."""
    target = tmp_path / "test.tar.gz"
    create_test_tar(target, {"file1.csv": "col1,col2\n1,2\n3,4"})
    return str(target)


# Test reading a ZIP archive and extracting content to a DataFrame
def test_read_zip_archive(test_zip, monkeypatch):
    """Reading a ZIP archive with extract_to_df=True yields the CSV's data."""
    # With extract_to_df=True the file selection is read via input(), which
    # has no answer under pytest; reply "1" to pick the only file listed.
    monkeypatch.setattr("builtins.input", lambda _prompt: "1")
    result = read_archive(test_zip, extract_to_df=True)
    expected = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
    pd.testing.assert_frame_equal(result, expected)


# Test reading a TAR.GZ archive and extracting content to a DataFrame
def test_read_tar_archive(test_tar, monkeypatch):
    """Reading a TAR.GZ archive with extract_to_df=True yields the CSV's data."""
    # With extract_to_df=True the file selection is read via input(), which
    # has no answer under pytest; reply "1" to pick the only file listed.
    monkeypatch.setattr("builtins.input", lambda _prompt: "1")
    result = read_archive(test_tar, extract_to_df=True)
    expected = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
    pd.testing.assert_frame_equal(result, expected)


# Test with an unsupported file type
def test_read_archive_invalid_type():
    """An unrecognised extension without a file_type hint raises ValueError."""
    with pytest.raises(
        ValueError,
        match=(
            # The trailing space matters: the message separates its two
            # sentences with a space, so the pattern must as well.
            r"Cannot infer file type from the file extension\. "
            r"Please specify the 'file_type' parameter\."
        ),
    ):
        read_archive("invalid_file.txt")


# Test with a ZIP archive containing no compatible files
def test_read_archive_no_csv(tmp_path):
    """A ZIP archive with no .csv/.xlsx member makes read_archive raise."""
    target = tmp_path / "empty.zip"
    create_test_zip(target, {"file1.txt": "No CSV here!"})
    # Guard against a false positive caused by a missing archive.
    assert os.path.exists(target)

    expected_message = r"No compatible files found in the archive\."
    with pytest.raises(ValueError, match=expected_message):
        read_archive(str(target), extract_to_df=True)


# Test listing files in a ZIP archive without extracting
def test_read_archive_file_list(test_zip):
    """With extract_to_df=False only the compatible file names are returned."""
    listed = read_archive(test_zip, extract_to_df=False)
    assert isinstance(listed, list)
    assert "file1.csv" in listed
    # Only compatible files should be listed.
    assert len(listed) == 1
2 changes: 1 addition & 1 deletion tests/test_documentation_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_docs_general_functions_present():
# I put in a subsample of general functions.
# This can be made much more robust.
rendered_correctly = False
with open("./site/api/functions/index.html", "r+") as f:
with open("./site/api/functions/index.html", "r+", encoding="utf-8") as f:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the encoding argument necessary here?

for line in f.readlines():
if "add_columns" in line or "update_where" in line:
rendered_correctly = True
Expand Down
2 changes: 2 additions & 0 deletions tests/test_simple.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file feels superfluous, could you delete it please?

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def test_simple():
    """Trivial arithmetic check that does not touch any project code."""
    total = 1 + 1
    assert total == 2
Loading