To reproduce the issue:
from adam_core.orbits import Orbits
from adam_core.utils.helpers import make_real_orbits

orbits = make_real_orbits()
# Replace the covariance values column with nulls
orbits = orbits.set_column("coordinates.covariance.values", None)
# Round-trip through Parquet
orbits.to_parquet("orbits.parquet")
orbits = Orbits.from_parquet("orbits.parquet")
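For contrast, the same round trip is expected to succeed when the covariance values are left populated, which points at the nulled column as the trigger. A minimal check under that assumption (the file name here is illustrative):

from adam_core.orbits import Orbits
from adam_core.utils.helpers import make_real_orbits

# Same round trip, but without nulling the covariance column; this should
# succeed if the failure is specific to null covariance values.
orbits = make_real_orbits()
orbits.to_parquet("orbits_with_cov.parquet")
roundtripped = Orbits.from_parquet("orbits_with_cov.parquet")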
This raises an ArrowInvalid error:
---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[6], line 1
----> 1 orbits.from_parquet("test.parquet")

File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/quivr/tables.py:797, in Table.from_parquet(cls, path, memory_map, pq_buffer_size, filters, column_name_map, validate, **kwargs)
    (...)
--> 797 table = cls._load_parquet_table(
    798     path=path,
    799     memory_map=memory_map,
    800     pq_buffer_size=pq_buffer_size,
    801     filters=filters,
    802     column_name_map=column_name_map,
    803 )
    804 return cls.from_pyarrow(table=table, validate=validate, permit_nulls=False, **kwargs)

File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/quivr/tables.py:830, in Table._load_parquet_table(cls, path, memory_map, pq_buffer_size, filters, column_name_map)
    827 column_names = [field.name for field in cls.schema]
    828 schema = cls.schema
--> 830 table = pyarrow.parquet.read_table(
    831     source=path,
    832     columns=column_names,
    833     memory_map=memory_map,
    834     buffer_size=pq_buffer_size,
    835     filters=filters,
    836     schema=schema,
    837 )
    838 md = pyarrow.parquet.read_metadata(path, memory_map=memory_map)
    839 table = table.replace_schema_metadata(md.metadata)

File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/parquet/core.py:3002, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
   2991 # TODO test that source is not a directory or a list
   2992 dataset = ParquetFile(
   2993     source, metadata=metadata, read_dictionary=read_dictionary,
   2994     memory_map=memory_map, buffer_size=buffer_size,
    (...)
   2999     thrift_container_size_limit=thrift_container_size_limit,
   3000 )
-> 3002 return dataset.read(columns=columns, use_threads=use_threads,
   3003                     use_pandas_metadata=use_pandas_metadata)

File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/parquet/core.py:2630, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
   2626 columns = (
   2627     list(columns) + list(set(index_columns) - set(columns))
   2628 )
-> 2630 table = self._dataset.to_table(
   2631     columns=columns, filter=self._filter_expression,
   2632     use_threads=use_threads
   2633 )

File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/_dataset.pyx:556, in pyarrow._dataset.Dataset.to_table()
File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/_dataset.pyx:3638, in pyarrow._dataset.Scanner.to_table()
File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~/software/anaconda3/envs/thor_py310/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: Expected all lists to be of size=36 but index 1 had size=0
This is ultimately caused by apache/arrow#35692.
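The failure can likely be reproduced with pyarrow alone, without adam_core or quivr. The sketch below assumes the upstream issue concerns null entries in fixed_size_list columns round-tripped through Parquet (the covariance values appear to be stored as a 36-element fixed-size list, i.e. a flattened 6x6 matrix); the schema and file name are illustrative, not taken from the report:

import pyarrow as pa
import pyarrow.parquet as pq

# A 36-element fixed-size list column with one null entry.
list_type = pa.fixed_size_list(pa.float64(), 36)
schema = pa.schema([("values", list_type)])
table = pa.table({"values": pa.array([[0.0] * 36, None], type=list_type)}, schema=schema)

pq.write_table(table, "repro.parquet")
# On affected pyarrow versions, reading back while enforcing the
# fixed_size_list schema is expected to raise:
#   ArrowInvalid: Expected all lists to be of size=36 but index 1 had size=0
pq.read_table("repro.parquet", schema=schema)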