-
Notifications
You must be signed in to change notification settings - Fork 173
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENH] : Implementation of read_archive function #1438
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
mkdocs/devguide.md | ||
mkdocs/devguide.md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
mkdocs/index.md | ||
mkdocs/index.md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,222 @@ | ||
from __future__ import annotations | ||
|
||
import tarfile | ||
import zipfile | ||
|
||
import pandas as pd | ||
import pandas_flavor as pf | ||
|
||
from janitor.utils import check | ||
|
||
|
||
def read_archive(
    file_path: str,
    extract_to_df: bool = True,
    file_type: str | None = None,
) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
    """
    Read an archive file (.zip, .tar, .tar.gz) and either list its
    compatible contents or load selected files into DataFrames.

    This is a plain reader function that takes a path, not a DataFrame,
    so it is intentionally not registered as a DataFrame method.

    Examples:
        >>> # Example usage (prompts for the files to load)
        >>> df = read_archive("data.zip", extract_to_df=True)  # doctest: +SKIP

    Args:
        file_path: The path to the archive file.
        extract_to_df: Whether to read the contents into a DataFrame
            (for CSV or similar formats). Default is True.
        file_type: Optional file type hint ('zip', 'tar', 'tar.gz').
            If None, it will be inferred from the file extension.

    Returns:
        - A pandas DataFrame if extract_to_df is True and the user
          selects a single file, or a list of DataFrames if several
          files are selected.
        - A list of compatible file names in the archive otherwise.

    Raises:
        ValueError: If the archive format is unsupported or cannot be
            inferred from the file extension.
    """
    check("file_path", file_path, [str])
    check("extract_to_df", extract_to_df, [bool])

    # An explicit hint wins; otherwise fall back to the file extension.
    file_type = file_type or _infer_file_type(file_path)

    if file_type == "zip":
        return _process_zip_archive(file_path, extract_to_df)
    if file_type in {"tar", "tar.gz"}:
        return _process_tar_archive(file_path, extract_to_df)
    raise ValueError(
        "Unsupported archive format. "
        "Supported formats are .zip, .tar, or .tar.gz."
    )
|
||
|
||
def _infer_file_type(file_path: str) -> str: | ||
""" | ||
Infer the type of the archive based on the file extension. | ||
|
||
Args: | ||
file_path: Path to the file. | ||
|
||
Returns: | ||
A string representing the archive type ('zip', 'tar', 'tar.gz'). | ||
|
||
Raises: | ||
ValueError if the file extension is unsupported. | ||
""" | ||
if file_path.endswith(".zip"): | ||
return "zip" | ||
elif file_path.endswith((".tar", ".tar.gz")): | ||
return "tar.gz" if file_path.endswith(".tar.gz") else "tar" | ||
else: | ||
raise ValueError( | ||
"Cannot infer file type from the file extension. " | ||
"Please specify the 'file_type' parameter." | ||
) | ||
|
||
|
||
def _process_zip_archive(
    file_path: str, extract_to_df: bool
) -> pd.DataFrame | list[str]:
    """
    Open a ZIP archive and list or load its compatible files.

    Args:
        file_path: Path to the ZIP file.
        extract_to_df: When True, hand the archive to the interactive
            extraction helper; when False, only return the file names.

    Returns:
        The result of the extraction helper, or the list of compatible
        file names in the archive.
    """
    with zipfile.ZipFile(file_path) as zip_handle:
        candidates = _list_compatible_files(zip_handle.namelist())
        if not extract_to_df:
            return candidates
        # Members can only be read while the archive is still open.
        return _select_and_extract_from_zip(zip_handle, candidates)
|
||
|
||
def _process_tar_archive(
    file_path: str, extract_to_df: bool
) -> pd.DataFrame | list[str]:
    """
    Process a TAR archive (plain or compressed).

    Args:
        file_path: Path to the TAR file.
        extract_to_df: Whether to extract the content into a DataFrame.

    Returns:
        A DataFrame (or list of DataFrames) when extracting, otherwise
        the list of compatible file names in the archive.
    """
    # "r:*" lets tarfile detect the compression from the file content
    # instead of guessing it from the file name, so an explicit
    # ``file_type`` hint works even when the extension does not match.
    with tarfile.open(file_path, "r:*") as archive:
        compatible_files = _list_compatible_files(archive.getnames())

        if extract_to_df:
            # Members can only be read while the archive is still open.
            return _select_and_extract_from_tar(archive, compatible_files)
        return compatible_files
|
||
|
||
def _list_compatible_files(file_names: list[str]) -> list[str]: | ||
""" | ||
Helper function to list compatible files (e.g., .csv, .xlsx) from an archive. | ||
|
||
Args: | ||
file_names: List of file names in the archive. | ||
|
||
Returns: | ||
List of compatible file names. | ||
""" | ||
compatible_files = [ | ||
file_name | ||
for file_name in file_names | ||
if file_name.endswith((".csv", ".xlsx")) | ||
] | ||
print("Fichiers compatibles détectés :", compatible_files) | ||
if not compatible_files: | ||
raise ValueError("No compatible files found in the archive.") | ||
return compatible_files | ||
|
||
|
||
def _select_and_extract_from_zip(
    archive: zipfile.ZipFile, compatible_files: list[str]
) -> pd.DataFrame | list[pd.DataFrame]:
    """
    Prompt the user for files in a ZIP archive and load each one.

    Args:
        archive: The open ZIP archive object.
        compatible_files: List of compatible file names.

    Returns:
        A single DataFrame when exactly one frame was loaded,
        otherwise a list of DataFrames.
    """
    # Suffix -> pandas reader, tried in order for every chosen member.
    readers = ((".csv", pd.read_csv), (".xlsx", pd.read_excel))

    frames = []
    for member_name in _select_files_interactively(compatible_files):
        with archive.open(member_name) as handle:
            for suffix, reader in readers:
                if member_name.endswith(suffix):
                    frames.append(reader(handle))
                    break

    if len(frames) > 1:
        return frames
    return frames[0]
|
||
|
||
def _select_and_extract_from_tar(
    archive: tarfile.TarFile, compatible_files: list[str]
) -> pd.DataFrame | list[pd.DataFrame]:
    """
    Helper function to allow the user to select
    and read specific files from a TAR archive.

    Args:
        archive: The open TAR archive object.
        compatible_files: List of compatible file names.

    Returns:
        A single DataFrame or a list of DataFrames.

    Raises:
        ValueError: If a selected member is not a regular file (for
            example a directory whose name ends in ``.csv``).
    """
    selected_files = _select_files_interactively(compatible_files)
    dfs = []
    for selected_file in selected_files:
        member = archive.getmember(selected_file)
        # extractfile() returns None for members that are not regular
        # files or links; fail with a clear message instead of crashing
        # inside the ``with`` statement.
        extracted = archive.extractfile(member)
        if extracted is None:
            raise ValueError(
                f"'{selected_file}' is not a regular file "
                "in the archive and cannot be read."
            )
        with extracted as file:
            if selected_file.endswith(".csv"):
                dfs.append(pd.read_csv(file))
            elif selected_file.endswith(".xlsx"):
                dfs.append(pd.read_excel(file))
    return dfs if len(dfs) > 1 else dfs[0]
|
||
|
||
def _select_files_interactively(compatible_files: list[str]) -> list[str]: | ||
""" | ||
Allow the user to select files from a list interactively. | ||
|
||
Args: | ||
compatible_files: List of compatible file names. | ||
|
||
Returns: | ||
List of selected file names. | ||
""" | ||
print("Compatible files found in the archive:") | ||
for idx, file_name in enumerate(compatible_files, 1): | ||
print(f"{idx}. {file_name}") | ||
|
||
selected_indices = ( | ||
input( | ||
"Enter the numbers of the files to read, " | ||
"separated by commas (e.g., 1,2,3): " | ||
) | ||
.strip() | ||
.split(",") | ||
) | ||
selected_files = [ | ||
compatible_files[int(idx) - 1] | ||
for idx in selected_indices | ||
if idx.strip().isdigit() and 0 < int(idx) <= len(compatible_files) | ||
] | ||
if not selected_files: | ||
raise ValueError("No valid files selected.") | ||
return selected_files |
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This file shouldn't be changed, IMO, based on what I see in this PR. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
../AUTHORS.md | ||
../AUTHORS.md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
../CHANGELOG.md | ||
../CHANGELOG.md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
col1,col2 | ||
1,2 | ||
3,4 |
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @Sabrina-Hassaim it feels like the changes in here weren't necessary for testing |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to the contents of |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import io | ||
import os | ||
import tarfile | ||
import zipfile | ||
|
||
import pandas as pd | ||
import pytest | ||
|
||
from janitor.functions.read_archive import read_archive | ||
|
||
|
||
# Helper function to create ZIP archives | ||
def create_test_zip(archive_path, files):
    """Write a ZIP archive at ``archive_path`` from a name -> text mapping."""
    with zipfile.ZipFile(archive_path, "w") as zip_out:
        for member_name in files:
            zip_out.writestr(member_name, files[member_name])
|
||
|
||
# Helper function to create TAR archives | ||
def create_test_tar(archive_path, files):
    """Write a gzip-compressed TAR at ``archive_path`` from a name -> text mapping."""
    with tarfile.open(archive_path, "w:gz") as tar_out:
        for member_name in files:
            payload = files[member_name].encode("utf-8")
            entry = tarfile.TarInfo(name=member_name)
            # The header must carry the size, or the member is written empty.
            entry.size = len(payload)
            tar_out.addfile(entry, io.BytesIO(payload))
|
||
|
||
# Fixture for creating a test ZIP archive | ||
@pytest.fixture
def test_zip(tmp_path):
    """Create ``test.zip`` holding one small CSV and return its path as a string."""
    zip_path = tmp_path / "test.zip"
    create_test_zip(zip_path, {"file1.csv": "col1,col2\n1,2\n3,4"})
    return str(zip_path)
|
||
|
||
# Fixture for creating a test TAR.GZ archive | ||
@pytest.fixture
def test_tar(tmp_path):
    """Create ``test.tar.gz`` holding one small CSV and return its path as a string."""
    tar_path = tmp_path / "test.tar.gz"
    create_test_tar(tar_path, {"file1.csv": "col1,col2\n1,2\n3,4"})
    return str(tar_path)
|
||
|
||
# Test reading a ZIP archive and extracting content to a DataFrame | ||
def test_read_zip_archive(test_zip, monkeypatch):
    """Reading a ZIP archive with one CSV returns that CSV as a DataFrame."""
    # With extract_to_df=True the reader prompts through input(); answer
    # the prompt with "1" (the only file) so the test runs unattended.
    monkeypatch.setattr("builtins.input", lambda _prompt: "1")
    result = read_archive(test_zip, extract_to_df=True)
    expected = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
    pd.testing.assert_frame_equal(result, expected)
|
||
|
||
# Test reading a TAR.GZ archive and extracting content to a DataFrame | ||
def test_read_tar_archive(test_tar, monkeypatch):
    """Reading a TAR.GZ archive with one CSV returns that CSV as a DataFrame."""
    # With extract_to_df=True the reader prompts through input(); answer
    # the prompt with "1" (the only file) so the test runs unattended.
    monkeypatch.setattr("builtins.input", lambda _prompt: "1")
    result = read_archive(test_tar, extract_to_df=True)
    expected = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
    pd.testing.assert_frame_equal(result, expected)
|
||
|
||
# Test with an unsupported file type | ||
def test_read_archive_invalid_type():
    """An extension that cannot be mapped to an archive type raises ValueError."""
    with pytest.raises(
        ValueError,
        match=(
            # The two sentences of the message are separated by a space,
            # so the pattern must include it to match.
            r"Cannot infer file type from the file extension\. "
            r"Please specify the 'file_type' parameter\."
        ),
    ):
        read_archive("invalid_file.txt")
|
||
|
||
# Test with a ZIP archive containing no compatible files | ||
def test_read_archive_no_csv(tmp_path):
    """A ZIP archive without any .csv/.xlsx member raises ValueError."""
    zip_path = tmp_path / "empty.zip"
    create_test_zip(zip_path, {"file1.txt": "No CSV here!"})
    # Sanity check: the archive was actually written before reading it.
    assert zip_path.exists()

    expected_message = r"No compatible files found in the archive\."
    with pytest.raises(ValueError, match=expected_message):
        read_archive(str(zip_path), extract_to_df=True)
|
||
|
||
# Test listing files in a ZIP archive without extracting | ||
def test_read_archive_file_list(test_zip):
    """With extract_to_df=False only the compatible file names are returned."""
    listing = read_archive(test_zip, extract_to_df=False)
    assert isinstance(listing, list)
    # Exactly one compatible file is expected, and it is the CSV.
    assert listing == ["file1.csv"]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,7 @@ def test_docs_general_functions_present(): | |
# I put in a subsample of general functions. | ||
# This can be made much more robust. | ||
rendered_correctly = False | ||
with open("./site/api/functions/index.html", "r+") as f: | ||
with open("./site/api/functions/index.html", "r+", encoding="utf-8") as f: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the |
||
for line in f.readlines(): | ||
if "add_columns" in line or "update_where" in line: | ||
rendered_correctly = True | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This file feels superfluous, could you delete it please? |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
def test_simple():
    """Placeholder check that integer addition works."""
    total = 1 + 1
    assert total == 2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the contents of this file should be moved under `io`, and they do not need the `@pf.register_dataframe_method` decorator either.