Skip to content

Commit

Permalink
fix: Remove "time" field from dataset if no time column exists (#429)
Browse files Browse the repository at this point in the history
* fix: Remove "time" field from dataset if no time column exists

* fix: Enhance datetime handling by conditionally combining date and time columns

* test: Add unit test for to_json

* test: rm blank line in test_to_json method, fix flake8
  • Loading branch information
osundwajeff authored Feb 19, 2025
1 parent dad4f7e commit bdc4457
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 5 deletions.
19 changes: 14 additions & 5 deletions django_project/gap/providers/observation.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,11 +584,20 @@ def to_json(self):
}
# Convert query results to a DataFrame
df = self.conn.sql(self.query).df()
# Combine date and time columns
df['datetime'] = pd.to_datetime(
df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['time']
)
df = df.drop(columns=['date', 'time', 'lat', 'lon'])

if self.has_time_column and 'time' in df.columns:
# Combine date and time columns if time column exists
df['datetime'] = pd.to_datetime(
df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['time']
)
drop_columns = ['date', 'time']
else:
# If dataset lacks time, only use the date
df['datetime'] = df['date']
drop_columns = ['date']
drop_columns.extend(['lat', 'lon'])
# Drop unnecessary columns safely
df = df.drop(columns=drop_columns, errors='ignore')
# Replace NaN with None
df = df.replace({np.nan: None})
output['data'] = df.to_dict(orient="records")
Expand Down
51 changes: 51 additions & 0 deletions django_project/gap/tests/providers/test_observation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import duckdb
import xarray as xr
import pandas as pd
import numpy as np

from django.test import TestCase
from datetime import datetime
Expand Down Expand Up @@ -892,3 +893,53 @@ def test_to_netcdf_drops_station_id_and_sets_index(
# Ensure NetCDF file was saved
mock_s3_storage.save.assert_called_once()
self.assertEqual(netcdf_output, "s3://test-bucket/output.nc")

@patch("gap.providers.observation.duckdb.connect")
def test_to_json(self, mock_duckdb_connect):
    """Test to_json merges date/time, drops columns and maps NaN to None.

    The DuckDB connection is mocked so the query yields a small DataFrame
    holding ``date``, ``time``, ``lat``, ``lon`` and ``value`` columns,
    including missing entries.
    """
    # Mock DuckDB connection
    mock_conn = MagicMock()
    mock_duckdb_connect.return_value = mock_conn

    # Mock SQL query result
    mock_conn.sql.return_value.df.return_value = pd.DataFrame({
        "date": pd.date_range(start="2023-01-01", periods=3),
        "time": ["12:00:00", "14:00:00", None],  # Some missing times
        "lat": [0.5, 0.6, None],  # Drop lat
        "lon": [36.5, None, 36.7],  # Drop lon
        "value": [100, np.nan, 300]  # Include NaN
    })

    # Create reader instance
    location_input = DatasetReaderInput.from_point(Point(36.8, -1.3))
    reader_value = ObservationParquetReaderValue(
        mock_conn,
        location_input,
        [],
        datetime(2023, 1, 1),
        datetime(2023, 1, 3),
        "SELECT * FROM test"
    )

    # Install a real boolean instead of a MagicMock: to_json reads the
    # flag as a plain attribute, and a MagicMock is always truthy no
    # matter which return_value it was configured with, so it would not
    # actually control which branch is exercised.
    with patch.object(
        ObservationParquetReaderValue,
        "has_time_column",
        new=True
    ):
        output = reader_value.to_json()

    # Ensure 'data' is present
    self.assertIn("data", output)
    self.assertEqual(len(output["data"]), 3)

    # Ensure 'datetime' is present and the source columns are gone
    for entry in output["data"]:
        self.assertIn("datetime", entry)
        self.assertNotIn("date", entry)  # Ensure 'date' was removed
        self.assertNotIn("time", entry)  # Ensure 'time' was merged
        self.assertNotIn("lat", entry)  # Ensure 'lat' was removed
        self.assertNotIn("lon", entry)  # Ensure 'lon' was removed

    # Ensure date and time were actually combined into one timestamp
    self.assertEqual(
        output["data"][0]["datetime"],
        pd.Timestamp("2023-01-01 12:00:00")
    )

    # Validate NaN conversion to None
    self.assertIsNone(output["data"][1].get("value"))

0 comments on commit bdc4457

Please sign in to comment.