feat: Add Parquet as a batch encoding option (meltano#2044)
* Add parquet encoding enum and dataclass
* WIP
* Add parquet support and tests
* Write fastparquet file with fs
* Change open_with argument
* Update fastparquet.write
* WIP
* WIP
* Adding s3fs as dependency
* Remove s3fs
* Remove fastparquet, add pyarrow as dependency
* Add parquet dependency
* Add support for gzip and snappy compression types for parquet
* Add pyarrow as a core dependency
* Add numpy for Python 3.7-3.11
* Add schema parsing
* Change dict to Dict for parsing types
* Added Batch Factory
* Remove pyarrow as core dependency and wrap logic in dependency checks
* Added missing quotes
* Removed JSON schema to pyarrow schema support. We don't currently have a way to support different numeric types like decimal.Decimal; reverting to using pyarrow's schema inference.
* Updated poetry.lock to add pyarrow as extra
* Updated formatting
* Updated for readability
* Added tests to account for missing pyarrow install
* Addressed ambiguous type issue
* Adding type ignore
* Added type ignore to correct location
* Update singer_sdk/batch.py
* Adding back normal imports
* mypy: install extras
* Ignore missing pyarrow types
* Move batchers to contrib modules
* Increase test coverage
* Fix types
* Test batcher and target

Co-authored-by: Edgar R. M <[email protected]>
Co-authored-by: Edgar Ramírez Mondragón <[email protected]>
1 parent: 2289173
Commit: 24127d0
Showing 21 changed files with 486 additions and 77 deletions.
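For orientation before the per-file diff: this commit makes "parquet" selectable alongside "jsonl" as a BATCH encoding, with pyarrow shipped as an optional extra. Below is a minimal sketch of what a tap's batch_config setting might look like with the new option, assuming the SDK's encoding/storage configuration keys; the storage root and prefix values are illustrative, not taken from this commit.

# Hypothetical tap settings enabling Parquet batch encoding (illustrative values).
TAP_CONFIG = {
    "batch_config": {
        "encoding": {
            "format": "parquet",    # new option added by this commit; "jsonl" remains available
            "compression": "gzip",  # gzip and snappy are mentioned in the commit message
        },
        "storage": {
            "root": "file:///tmp/batches",  # assumed local filesystem root, for illustration only
            "prefix": "example-batch-",     # hypothetical filename prefix
        },
    },
}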
@@ -1,7 +1,7 @@
 {
   "type": "object",
   "properties": {
-    "code": { "type": ["null", "string"] },
-    "name": { "type": ["null", "string"] }
+    "code": { "type": ["string", "null"] },
+    "name": { "type": ["string", "null"] }
   }
 }
@@ -0,0 +1 @@
"""Singer SDK contrib modules."""
@@ -0,0 +1,52 @@
"""JSON Lines Record Batcher."""

from __future__ import annotations

import gzip
import json
import typing as t
from uuid import uuid4

from singer_sdk.batch import BaseBatcher, lazy_chunked_generator

__all__ = ["JSONLinesBatcher"]


class JSONLinesBatcher(BaseBatcher):
    """JSON Lines Record Batcher."""

    def get_batches(
        self,
        records: t.Iterator[dict],
    ) -> t.Iterator[list[str]]:
        """Yield manifest of batches.

        Args:
            records: The records to batch.

        Yields:
            A list of file paths (called a manifest).
        """
        sync_id = f"{self.tap_name}--{self.stream_name}-{uuid4()}"
        prefix = self.batch_config.storage.prefix or ""

        for i, chunk in enumerate(
            lazy_chunked_generator(
                records,
                self.batch_config.batch_size,
            ),
            start=1,
        ):
            filename = f"{prefix}{sync_id}-{i}.json.gz"
            with self.batch_config.storage.fs(create=True) as fs:
                # TODO: Determine compression from config.
                with fs.open(filename, "wb") as f, gzip.GzipFile(
                    fileobj=f,
                    mode="wb",
                ) as gz:
                    gz.writelines(
                        (json.dumps(record, default=str) + "\n").encode()
                        for record in chunk
                    )
                file_url = fs.geturl(filename)
            yield [file_url]
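For a sense of how a batcher like the one above is driven (this usage is not part of the diff): it is constructed with the tap name, stream name, and a BatchConfig, and get_batches yields one manifest, a list of file URLs, per chunk of records. A rough sketch, assuming a BaseBatcher(tap_name, stream_name, batch_config) constructor, a BatchConfig.from_dict helper in singer_sdk.helpers._batch, and the contrib module path; the tap and stream names are made up.

from singer_sdk.contrib.batch_encoder_jsonl import JSONLinesBatcher  # assumed module path
from singer_sdk.helpers._batch import BatchConfig  # assumed helper location

# Illustrative configuration; "tap-example" and "users" are hypothetical names.
batch_config = BatchConfig.from_dict(
    {
        "encoding": {"format": "jsonl", "compression": "gzip"},
        "storage": {"root": "file:///tmp/batches", "prefix": "users-"},
    }
)

batcher = JSONLinesBatcher("tap-example", "users", batch_config)
records = iter([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])

# Each yielded manifest is a list of URLs to gzip-compressed JSONL files.
for manifest in batcher.get_batches(records):
    print(manifest)  # e.g. ["file:///tmp/batches/users-tap-example--users-<uuid>-1.json.gz"]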
@@ -0,0 +1,54 @@
"""Parquet Record Batcher."""

from __future__ import annotations

import typing as t
from uuid import uuid4

from singer_sdk.batch import BaseBatcher, lazy_chunked_generator

__all__ = ["ParquetBatcher"]


class ParquetBatcher(BaseBatcher):
    """Parquet Record Batcher."""

    def get_batches(
        self,
        records: t.Iterator[dict],
    ) -> t.Iterator[list[str]]:
        """Yield manifest of batches.

        Args:
            records: The records to batch.

        Yields:
            A list of file paths (called a manifest).
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        sync_id = f"{self.tap_name}--{self.stream_name}-{uuid4()}"
        prefix = self.batch_config.storage.prefix or ""

        for i, chunk in enumerate(
            lazy_chunked_generator(
                records,
                self.batch_config.batch_size,
            ),
            start=1,
        ):
            filename = f"{prefix}{sync_id}={i}.parquet"
            if self.batch_config.encoding.compression == "gzip":
                filename = f"{filename}.gz"
            with self.batch_config.storage.fs() as fs:
                with fs.open(filename, "wb") as f:
                    pylist = list(chunk)
                    table = pa.Table.from_pylist(pylist)
                    if self.batch_config.encoding.compression == "gzip":
                        pq.write_table(table, f, compression="GZIP")
                    else:
                        pq.write_table(table, f)

                file_url = fs.geturl(filename)
            yield [file_url]
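One design note from the commit message, visible in the batcher above: instead of converting a stream's JSON schema to a pyarrow schema (problematic for types such as decimal.Decimal), the records are handed to pyarrow as plain dicts and pa.Table.from_pylist infers the column types. A small standalone sketch of that round trip, independent of the SDK; the file name and records are illustrative.

import pyarrow as pa
import pyarrow.parquet as pq

# pyarrow infers the Arrow schema from the Python values themselves.
records = [
    {"code": "US", "name": "United States"},
    {"code": None, "name": "Unknown"},
]
table = pa.Table.from_pylist(records)
print(table.schema)  # code: string, name: string (inferred, no JSON-schema translation)

# Written and read back the same way a target consuming the manifest would.
pq.write_table(table, "example-batch.parquet.gz", compression="GZIP")
print(pq.read_table("example-batch.parquet.gz").to_pylist())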